diff --git a/utils/PyRSS2Gen.py.bak b/utils/PyRSS2Gen.py.bak deleted file mode 100644 index fc1f1cf24..000000000 --- a/utils/PyRSS2Gen.py.bak +++ /dev/null @@ -1,443 +0,0 @@ -"""PyRSS2Gen - A Python library for generating RSS 2.0 feeds.""" - -__name__ = "PyRSS2Gen" -__version__ = (1, 0, 0) -__author__ = "Andrew Dalke " - -_generator_name = __name__ + "-" + ".".join(map(str, __version__)) - -import datetime - -# Could make this the base class; will need to add 'publish' -class WriteXmlMixin: - def write_xml(self, outfile, encoding = "iso-8859-1"): - from xml.sax import saxutils - handler = saxutils.XMLGenerator(outfile, encoding) - handler.startDocument() - self.publish(handler) - handler.endDocument() - - def to_xml(self, encoding = "iso-8859-1"): - try: - import cStringIO as StringIO - except ImportError: - import StringIO - f = StringIO.StringIO() - self.write_xml(f, encoding) - return f.getvalue() - - -def _element(handler, name, obj, d = {}): - if isinstance(obj, basestring) or obj is None: - # special-case handling to make the API easier - # to use for the common case. - handler.startElement(name, d) - if obj is not None: - handler.characters(obj) - handler.endElement(name) - else: - # It better know how to emit the correct XML. - obj.publish(handler) - -def _opt_element(handler, name, obj): - if obj is None: - return - _element(handler, name, obj) - - -def _format_date(dt): - """convert a datetime into an RFC 822 formatted date - - Input date must be in GMT. - """ - # Looks like: - # Sat, 07 Sep 2002 00:00:01 GMT - # Can't use strftime because that's locale dependent - # - # Isn't there a standard way to do this for Python? The - # rfc822 and email.Utils modules assume a timestamp. The - # following is based on the rfc822 module. - return "%s, %02d %s %04d %02d:%02d:%02d GMT" % ( - ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()], - dt.day, - ["Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1], - dt.year, dt.hour, dt.minute, dt.second) - - -## -# A couple simple wrapper objects for the fields which -# take a simple value other than a string. -class IntElement: - """implements the 'publish' API for integers - - Takes the tag name and the integer value to publish. - - (Could be used for anything which uses str() to be published - to text for XML.) - """ - element_attrs = {} - def __init__(self, name, val): - self.name = name - self.val = val - def publish(self, handler): - handler.startElement(self.name, self.element_attrs) - handler.characters(str(self.val)) - handler.endElement(self.name) - -class DateElement: - """implements the 'publish' API for a datetime.datetime - - Takes the tag name and the datetime to publish. - - Converts the datetime to RFC 2822 timestamp (4-digit year). 
- """ - def __init__(self, name, dt): - self.name = name - self.dt = dt - def publish(self, handler): - _element(handler, self.name, _format_date(self.dt)) -#### - -class Category: - """Publish a category element""" - def __init__(self, category, domain = None): - self.category = category - self.domain = domain - def publish(self, handler): - d = {} - if self.domain is not None: - d["domain"] = self.domain - _element(handler, "category", self.category, d) - -class Cloud: - """Publish a cloud""" - def __init__(self, domain, port, path, - registerProcedure, protocol): - self.domain = domain - self.port = port - self.path = path - self.registerProcedure = registerProcedure - self.protocol = protocol - def publish(self, handler): - _element(handler, "cloud", None, { - "domain": self.domain, - "port": str(self.port), - "path": self.path, - "registerProcedure": self.registerProcedure, - "protocol": self.protocol}) - -class Image: - """Publish a channel Image""" - element_attrs = {} - def __init__(self, url, title, link, - width = None, height = None, description = None): - self.url = url - self.title = title - self.link = link - self.width = width - self.height = height - self.description = description - - def publish(self, handler): - handler.startElement("image", self.element_attrs) - - _element(handler, "url", self.url) - _element(handler, "title", self.title) - _element(handler, "link", self.link) - - width = self.width - if isinstance(width, int): - width = IntElement("width", width) - _opt_element(handler, "width", width) - - height = self.height - if isinstance(height, int): - height = IntElement("height", height) - _opt_element(handler, "height", height) - - _opt_element(handler, "description", self.description) - - handler.endElement("image") - -class Guid: - """Publish a guid - - Defaults to being a permalink, which is the assumption if it's - omitted. Hence strings are always permalinks. - """ - def __init__(self, guid, isPermaLink = 1): - self.guid = guid - self.isPermaLink = isPermaLink - def publish(self, handler): - d = {} - if self.isPermaLink: - d["isPermaLink"] = "true" - else: - d["isPermaLink"] = "false" - _element(handler, "guid", self.guid, d) - -class TextInput: - """Publish a textInput - - Apparently this is rarely used. - """ - element_attrs = {} - def __init__(self, title, description, name, link): - self.title = title - self.description = description - self.name = name - self.link = link - - def publish(self, handler): - handler.startElement("textInput", self.element_attrs) - _element(handler, "title", self.title) - _element(handler, "description", self.description) - _element(handler, "name", self.name) - _element(handler, "link", self.link) - handler.endElement("textInput") - - -class Enclosure: - """Publish an enclosure""" - def __init__(self, url, length, type): - self.url = url - self.length = length - self.type = type - def publish(self, handler): - _element(handler, "enclosure", None, - {"url": self.url, - "length": str(self.length), - "type": self.type, - }) - -class Source: - """Publish the item's original source, used by aggregators""" - def __init__(self, name, url): - self.name = name - self.url = url - def publish(self, handler): - _element(handler, "source", self.name, {"url": self.url}) - -class SkipHours: - """Publish the skipHours - - This takes a list of hours, as integers. 
- """ - element_attrs = {} - def __init__(self, hours): - self.hours = hours - def publish(self, handler): - if self.hours: - handler.startElement("skipHours", self.element_attrs) - for hour in self.hours: - _element(handler, "hour", str(hour)) - handler.endElement("skipHours") - -class SkipDays: - """Publish the skipDays - - This takes a list of days as strings. - """ - element_attrs = {} - def __init__(self, days): - self.days = days - def publish(self, handler): - if self.days: - handler.startElement("skipDays", self.element_attrs) - for day in self.days: - _element(handler, "day", day) - handler.endElement("skipDays") - -class RSS2(WriteXmlMixin): - """The main RSS class. - - Stores the channel attributes, with the "category" elements under - ".categories" and the RSS items under ".items". - """ - - rss_attrs = {"version": "2.0"} - element_attrs = {} - def __init__(self, - title, - link, - description, - - language = None, - copyright = None, - managingEditor = None, - webMaster = None, - pubDate = None, # a datetime, *in* *GMT* - lastBuildDate = None, # a datetime - - categories = None, # list of strings or Category - generator = _generator_name, - docs = "http://blogs.law.harvard.edu/tech/rss", - cloud = None, # a Cloud - ttl = None, # integer number of minutes - - image = None, # an Image - rating = None, # a string; I don't know how it's used - textInput = None, # a TextInput - skipHours = None, # a SkipHours with a list of integers - skipDays = None, # a SkipDays with a list of strings - - items = None, # list of RSSItems - ): - self.title = title - self.link = link - self.description = description - self.language = language - self.copyright = copyright - self.managingEditor = managingEditor - - self.webMaster = webMaster - self.pubDate = pubDate - self.lastBuildDate = lastBuildDate - - if categories is None: - categories = [] - self.categories = categories - self.generator = generator - self.docs = docs - self.cloud = cloud - self.ttl = ttl - self.image = image - self.rating = rating - self.textInput = textInput - self.skipHours = skipHours - self.skipDays = skipDays - - if items is None: - items = [] - self.items = items - - def publish(self, handler): - handler.startElement("rss", self.rss_attrs) - handler.startElement("channel", self.element_attrs) - _element(handler, "title", self.title) - _element(handler, "link", self.link) - _element(handler, "description", self.description) - - self.publish_extensions(handler) - - _opt_element(handler, "language", self.language) - _opt_element(handler, "copyright", self.copyright) - _opt_element(handler, "managingEditor", self.managingEditor) - _opt_element(handler, "webMaster", self.webMaster) - - pubDate = self.pubDate - if isinstance(pubDate, datetime.datetime): - pubDate = DateElement("pubDate", pubDate) - _opt_element(handler, "pubDate", pubDate) - - lastBuildDate = self.lastBuildDate - if isinstance(lastBuildDate, datetime.datetime): - lastBuildDate = DateElement("lastBuildDate", lastBuildDate) - _opt_element(handler, "lastBuildDate", lastBuildDate) - - for category in self.categories: - if isinstance(category, basestring): - category = Category(category) - category.publish(handler) - - _opt_element(handler, "generator", self.generator) - _opt_element(handler, "docs", self.docs) - - if self.cloud is not None: - self.cloud.publish(handler) - - ttl = self.ttl - if isinstance(self.ttl, int): - ttl = IntElement("ttl", ttl) - _opt_element(handler, "tt", ttl) - - if self.image is not None: - self.image.publish(handler) - - 
_opt_element(handler, "rating", self.rating) - if self.textInput is not None: - self.textInput.publish(handler) - if self.skipHours is not None: - self.skipHours.publish(handler) - if self.skipDays is not None: - self.skipDays.publish(handler) - - for item in self.items: - item.publish(handler) - - handler.endElement("channel") - handler.endElement("rss") - - def publish_extensions(self, handler): - # Derived classes can hook into this to insert - # output after the three required fields. - pass - - - -class RSSItem(WriteXmlMixin): - """Publish an RSS Item""" - element_attrs = {} - def __init__(self, - title = None, # string - link = None, # url as string - description = None, # string - author = None, # email address as string - categories = None, # list of string or Category - comments = None, # url as string - enclosure = None, # an Enclosure - guid = None, # a unique string - pubDate = None, # a datetime - source = None, # a Source - ): - - if title is None and description is None: - raise TypeError( - "must define at least one of 'title' or 'description'") - self.title = title - self.link = link - self.description = description - self.author = author - if categories is None: - categories = [] - self.categories = categories - self.comments = comments - self.enclosure = enclosure - self.guid = guid - self.pubDate = pubDate - self.source = source - # It sure does get tedious typing these names three times... - - def publish(self, handler): - handler.startElement("item", self.element_attrs) - _opt_element(handler, "title", self.title) - _opt_element(handler, "link", self.link) - self.publish_extensions(handler) - _opt_element(handler, "description", self.description) - _opt_element(handler, "author", self.author) - - for category in self.categories: - if isinstance(category, basestring): - category = Category(category) - category.publish(handler) - - _opt_element(handler, "comments", self.comments) - if self.enclosure is not None: - self.enclosure.publish(handler) - _opt_element(handler, "guid", self.guid) - - pubDate = self.pubDate - if isinstance(pubDate, datetime.datetime): - pubDate = DateElement("pubDate", pubDate) - _opt_element(handler, "pubDate", pubDate) - - if self.source is not None: - self.source.publish(handler) - - handler.endElement("item") - - def publish_extensions(self, handler): - # Derived classes can hook into this to insert - # output after the title and link elements - pass diff --git a/utils/S3.py.bak b/utils/S3.py.bak deleted file mode 100644 index 77691e668..000000000 --- a/utils/S3.py.bak +++ /dev/null @@ -1,617 +0,0 @@ -#!/usr/bin/env python - -# This software code is made available "AS IS" without warranties of any -# kind. You may copy, display, modify and redistribute the software -# code either by itself or as incorporated into your code; provided that -# you do not remove any proprietary notices. Your use of this software -# code is at your own risk and you waive any claim against Amazon -# Digital Services, Inc. or its affiliates with respect to your use of -# this software code. (c) 2006-2007 Amazon Digital Services, Inc. or its -# affiliates. 
- -import base64 -import hmac -import httplib -import re -import sha -import sys -import time -import urllib -import urlparse -import xml.sax - -DEFAULT_HOST = 's3.amazonaws.com' -PORTS_BY_SECURITY = { True: 443, False: 80 } -METADATA_PREFIX = 'x-amz-meta-' -AMAZON_HEADER_PREFIX = 'x-amz-' - -# generates the aws canonical string for the given parameters -def canonical_string(method, bucket="", key="", query_args={}, headers={}, expires=None): - interesting_headers = {} - for header_key in headers: - lk = header_key.lower() - if lk in ['content-md5', 'content-type', 'date'] or lk.startswith(AMAZON_HEADER_PREFIX): - interesting_headers[lk] = headers[header_key].strip() - - # these keys get empty strings if they don't exist - if not interesting_headers.has_key('content-type'): - interesting_headers['content-type'] = '' - if not interesting_headers.has_key('content-md5'): - interesting_headers['content-md5'] = '' - - # just in case someone used this. it's not necessary in this lib. - if interesting_headers.has_key('x-amz-date'): - interesting_headers['date'] = '' - - # if you're using expires for query string auth, then it trumps date - # (and x-amz-date) - if expires: - interesting_headers['date'] = str(expires) - - sorted_header_keys = interesting_headers.keys() - sorted_header_keys.sort() - - buf = "%s\n" % method - for header_key in sorted_header_keys: - if header_key.startswith(AMAZON_HEADER_PREFIX): - buf += "%s:%s\n" % (header_key, interesting_headers[header_key]) - else: - buf += "%s\n" % interesting_headers[header_key] - - # append the bucket if it exists - if bucket != "": - buf += "/%s" % bucket - - # add the key. even if it doesn't exist, add the slash - buf += "/%s" % urllib.quote_plus(key) - - # handle special query string arguments - - if query_args.has_key("acl"): - buf += "?acl" - elif query_args.has_key("torrent"): - buf += "?torrent" - elif query_args.has_key("logging"): - buf += "?logging" - elif query_args.has_key("location"): - buf += "?location" - - return buf - -# computes the base64'ed hmac-sha hash of the canonical string and the secret -# access key, optionally urlencoding the result -def encode(aws_secret_access_key, str, urlencode=False): - b64_hmac = base64.encodestring(hmac.new(aws_secret_access_key, str, sha).digest()).strip() - if urlencode: - return urllib.quote_plus(b64_hmac) - else: - return b64_hmac - -def merge_meta(headers, metadata): - final_headers = headers.copy() - for k in metadata.keys(): - final_headers[METADATA_PREFIX + k] = metadata[k] - - return final_headers - -# builds the query arg string -def query_args_hash_to_string(query_args): - query_string = "" - pairs = [] - for k, v in query_args.items(): - piece = k - if v != None: - piece += "=%s" % urllib.quote_plus(str(v)) - pairs.append(piece) - - return '&'.join(pairs) - - -class CallingFormat: - PATH = 1 - SUBDOMAIN = 2 - VANITY = 3 - - def build_url_base(protocol, server, port, bucket, calling_format): - url_base = '%s://' % protocol - - if bucket == '': - url_base += server - elif calling_format == CallingFormat.SUBDOMAIN: - url_base += "%s.%s" % (bucket, server) - elif calling_format == CallingFormat.VANITY: - url_base += bucket - else: - url_base += server - - url_base += ":%s" % port - - if (bucket != '') and (calling_format == CallingFormat.PATH): - url_base += "/%s" % bucket - - return url_base - - build_url_base = staticmethod(build_url_base) - - - -class Location: - DEFAULT = None - EU = 'EU' - - - -class AWSAuthConnection: - def __init__(self, aws_access_key_id, 
aws_secret_access_key, is_secure=True, - server=DEFAULT_HOST, port=None, calling_format=CallingFormat.SUBDOMAIN): - - if not port: - port = PORTS_BY_SECURITY[is_secure] - - self.aws_access_key_id = aws_access_key_id - self.aws_secret_access_key = aws_secret_access_key - self.is_secure = is_secure - self.server = server - self.port = port - self.calling_format = calling_format - - def create_bucket(self, bucket, headers={}): - return Response(self._make_request('PUT', bucket, '', {}, headers)) - - def create_located_bucket(self, bucket, location=Location.DEFAULT, headers={}): - if location == Location.DEFAULT: - body = "" - else: - body = "" + \ - location + \ - "" - return Response(self._make_request('PUT', bucket, '', {}, headers, body)) - - def check_bucket_exists(self, bucket): - return self._make_request('HEAD', bucket, '', {}, {}) - - def list_bucket(self, bucket, options={}, headers={}): - return ListBucketResponse(self._make_request('GET', bucket, '', options, headers)) - - def delete_bucket(self, bucket, headers={}): - return Response(self._make_request('DELETE', bucket, '', {}, headers)) - - def put(self, bucket, key, object, headers={}): - if not isinstance(object, S3Object): - object = S3Object(object) - - return Response( - self._make_request( - 'PUT', - bucket, - key, - {}, - headers, - object.data, - object.metadata)) - - def get(self, bucket, key, headers={}): - return GetResponse( - self._make_request('GET', bucket, key, {}, headers)) - - def delete(self, bucket, key, headers={}): - return Response( - self._make_request('DELETE', bucket, key, {}, headers)) - - def get_bucket_logging(self, bucket, headers={}): - return GetResponse(self._make_request('GET', bucket, '', { 'logging': None }, headers)) - - def put_bucket_logging(self, bucket, logging_xml_doc, headers={}): - return Response(self._make_request('PUT', bucket, '', { 'logging': None }, headers, logging_xml_doc)) - - def get_bucket_acl(self, bucket, headers={}): - return self.get_acl(bucket, '', headers) - - def get_acl(self, bucket, key, headers={}): - return GetResponse( - self._make_request('GET', bucket, key, { 'acl': None }, headers)) - - def put_bucket_acl(self, bucket, acl_xml_document, headers={}): - return self.put_acl(bucket, '', acl_xml_document, headers) - - def put_acl(self, bucket, key, acl_xml_document, headers={}): - return Response( - self._make_request( - 'PUT', - bucket, - key, - { 'acl': None }, - headers, - acl_xml_document)) - - def list_all_my_buckets(self, headers={}): - return ListAllMyBucketsResponse(self._make_request('GET', '', '', {}, headers)) - - def get_bucket_location(self, bucket): - return LocationResponse(self._make_request('GET', bucket, '', {'location' : None})) - - # end public methods - - def _make_request(self, method, bucket='', key='', query_args={}, headers={}, data='', metadata={}): - - server = '' - if bucket == '': - server = self.server - elif self.calling_format == CallingFormat.SUBDOMAIN: - server = "%s.%s" % (bucket, self.server) - elif self.calling_format == CallingFormat.VANITY: - server = bucket - else: - server = self.server - - path = '' - - if (bucket != '') and (self.calling_format == CallingFormat.PATH): - path += "/%s" % bucket - - # add the slash after the bucket regardless - # the key will be appended if it is non-empty - path += "/%s" % urllib.quote_plus(key) - - - # build the path_argument string - # add the ? in all cases since - # signature and credentials follow path args - if len(query_args): - path += "?" 
+ query_args_hash_to_string(query_args) - - is_secure = self.is_secure - host = "%s:%d" % (server, self.port) - while True: - if (is_secure): - connection = httplib.HTTPSConnection(host) - else: - connection = httplib.HTTPConnection(host) - - final_headers = merge_meta(headers, metadata); - # add auth header - self._add_aws_auth_header(final_headers, method, bucket, key, query_args) - - connection.request(method, path, data, final_headers) - resp = connection.getresponse() - if resp.status < 300 or resp.status >= 400: - return resp - # handle redirect - location = resp.getheader('location') - if not location: - return resp - # (close connection) - resp.read() - scheme, host, path, params, query, fragment \ - = urlparse.urlparse(location) - if scheme == "http": is_secure = True - elif scheme == "https": is_secure = False - else: raise invalidURL("Not http/https: " + location) - if query: path += "?" + query - # retry with redirect - - def _add_aws_auth_header(self, headers, method, bucket, key, query_args): - if not headers.has_key('Date'): - headers['Date'] = time.strftime("%a, %d %b %Y %X GMT", time.gmtime()) - - c_string = canonical_string(method, bucket, key, query_args, headers) - headers['Authorization'] = \ - "AWS %s:%s" % (self.aws_access_key_id, encode(self.aws_secret_access_key, c_string)) - - -class QueryStringAuthGenerator: - # by default, expire in 1 minute - DEFAULT_EXPIRES_IN = 60 - - def __init__(self, aws_access_key_id, aws_secret_access_key, is_secure=True, - server=DEFAULT_HOST, port=None, calling_format=CallingFormat.SUBDOMAIN): - - if not port: - port = PORTS_BY_SECURITY[is_secure] - - self.aws_access_key_id = aws_access_key_id - self.aws_secret_access_key = aws_secret_access_key - if (is_secure): - self.protocol = 'https' - else: - self.protocol = 'http' - - self.is_secure = is_secure - self.server = server - self.port = port - self.calling_format = calling_format - self.__expires_in = QueryStringAuthGenerator.DEFAULT_EXPIRES_IN - self.__expires = None - - # for backwards compatibility with older versions - self.server_name = "%s:%s" % (self.server, self.port) - - def set_expires_in(self, expires_in): - self.__expires_in = expires_in - self.__expires = None - - def set_expires(self, expires): - self.__expires = expires - self.__expires_in = None - - def create_bucket(self, bucket, headers={}): - return self.generate_url('PUT', bucket, '', {}, headers) - - def list_bucket(self, bucket, options={}, headers={}): - return self.generate_url('GET', bucket, '', options, headers) - - def delete_bucket(self, bucket, headers={}): - return self.generate_url('DELETE', bucket, '', {}, headers) - - def put(self, bucket, key, object, headers={}): - if not isinstance(object, S3Object): - object = S3Object(object) - - return self.generate_url( - 'PUT', - bucket, - key, - {}, - merge_meta(headers, object.metadata)) - - def get(self, bucket, key, headers={}): - return self.generate_url('GET', bucket, key, {}, headers) - - def delete(self, bucket, key, headers={}): - return self.generate_url('DELETE', bucket, key, {}, headers) - - def get_bucket_logging(self, bucket, headers={}): - return self.generate_url('GET', bucket, '', { 'logging': None }, headers) - - def put_bucket_logging(self, bucket, logging_xml_doc, headers={}): - return self.generate_url('PUT', bucket, '', { 'logging': None }, headers) - - def get_bucket_acl(self, bucket, headers={}): - return self.get_acl(bucket, '', headers) - - def get_acl(self, bucket, key='', headers={}): - return self.generate_url('GET', bucket, key, { 
'acl': None }, headers) - - def put_bucket_acl(self, bucket, acl_xml_document, headers={}): - return self.put_acl(bucket, '', acl_xml_document, headers) - - # don't really care what the doc is here. - def put_acl(self, bucket, key, acl_xml_document, headers={}): - return self.generate_url('PUT', bucket, key, { 'acl': None }, headers) - - def list_all_my_buckets(self, headers={}): - return self.generate_url('GET', '', '', {}, headers) - - def make_bare_url(self, bucket, key=''): - full_url = self.generate_url(self, bucket, key) - return full_url[:full_url.index('?')] - - def generate_url(self, method, bucket='', key='', query_args={}, headers={}): - expires = 0 - if self.__expires_in != None: - expires = int(time.time() + self.__expires_in) - elif self.__expires != None: - expires = int(self.__expires) - else: - raise "Invalid expires state" - - canonical_str = canonical_string(method, bucket, key, query_args, headers, expires) - encoded_canonical = encode(self.aws_secret_access_key, canonical_str) - - url = CallingFormat.build_url_base(self.protocol, self.server, self.port, bucket, self.calling_format) - - url += "/%s" % urllib.quote_plus(key) - - query_args['Signature'] = encoded_canonical - query_args['Expires'] = expires - query_args['AWSAccessKeyId'] = self.aws_access_key_id - - url += "?%s" % query_args_hash_to_string(query_args) - - return url - - -class S3Object: - def __init__(self, data, metadata={}): - self.data = data - self.metadata = metadata - -class Owner: - def __init__(self, id='', display_name=''): - self.id = id - self.display_name = display_name - -class ListEntry: - def __init__(self, key='', last_modified=None, etag='', size=0, storage_class='', owner=None): - self.key = key - self.last_modified = last_modified - self.etag = etag - self.size = size - self.storage_class = storage_class - self.owner = owner - -class CommonPrefixEntry: - def __init(self, prefix=''): - self.prefix = prefix - -class Bucket: - def __init__(self, name='', creation_date=''): - self.name = name - self.creation_date = creation_date - -class Response: - def __init__(self, http_response): - self.http_response = http_response - # you have to do this read, even if you don't expect a body. - # otherwise, the next request fails. 
- self.body = http_response.read() - if http_response.status >= 300 and self.body: - self.message = self.body - else: - self.message = "%03d %s" % (http_response.status, http_response.reason) - - - -class ListBucketResponse(Response): - def __init__(self, http_response): - Response.__init__(self, http_response) - if http_response.status < 300: - handler = ListBucketHandler() - xml.sax.parseString(self.body, handler) - self.entries = handler.entries - self.common_prefixes = handler.common_prefixes - self.name = handler.name - self.marker = handler.marker - self.prefix = handler.prefix - self.is_truncated = handler.is_truncated - self.delimiter = handler.delimiter - self.max_keys = handler.max_keys - self.next_marker = handler.next_marker - else: - self.entries = [] - -class ListAllMyBucketsResponse(Response): - def __init__(self, http_response): - Response.__init__(self, http_response) - if http_response.status < 300: - handler = ListAllMyBucketsHandler() - xml.sax.parseString(self.body, handler) - self.entries = handler.entries - else: - self.entries = [] - -class GetResponse(Response): - def __init__(self, http_response): - Response.__init__(self, http_response) - response_headers = http_response.msg # older pythons don't have getheaders - metadata = self.get_aws_metadata(response_headers) - self.object = S3Object(self.body, metadata) - - def get_aws_metadata(self, headers): - metadata = {} - for hkey in headers.keys(): - if hkey.lower().startswith(METADATA_PREFIX): - metadata[hkey[len(METADATA_PREFIX):]] = headers[hkey] - del headers[hkey] - - return metadata - -class LocationResponse(Response): - def __init__(self, http_response): - Response.__init__(self, http_response) - if http_response.status < 300: - handler = LocationHandler() - xml.sax.parseString(self.body, handler) - self.location = handler.location - -class ListBucketHandler(xml.sax.ContentHandler): - def __init__(self): - self.entries = [] - self.curr_entry = None - self.curr_text = '' - self.common_prefixes = [] - self.curr_common_prefix = None - self.name = '' - self.marker = '' - self.prefix = '' - self.is_truncated = False - self.delimiter = '' - self.max_keys = 0 - self.next_marker = '' - self.is_echoed_prefix_set = False - - def startElement(self, name, attrs): - if name == 'Contents': - self.curr_entry = ListEntry() - elif name == 'Owner': - self.curr_entry.owner = Owner() - elif name == 'CommonPrefixes': - self.curr_common_prefix = CommonPrefixEntry() - - - def endElement(self, name): - if name == 'Contents': - self.entries.append(self.curr_entry) - elif name == 'CommonPrefixes': - self.common_prefixes.append(self.curr_common_prefix) - elif name == 'Key': - self.curr_entry.key = self.curr_text - elif name == 'LastModified': - self.curr_entry.last_modified = self.curr_text - elif name == 'ETag': - self.curr_entry.etag = self.curr_text - elif name == 'Size': - self.curr_entry.size = int(self.curr_text) - elif name == 'ID': - self.curr_entry.owner.id = self.curr_text - elif name == 'DisplayName': - self.curr_entry.owner.display_name = self.curr_text - elif name == 'StorageClass': - self.curr_entry.storage_class = self.curr_text - elif name == 'Name': - self.name = self.curr_text - elif name == 'Prefix' and self.is_echoed_prefix_set: - self.curr_common_prefix.prefix = self.curr_text - elif name == 'Prefix': - self.prefix = self.curr_text - self.is_echoed_prefix_set = True - elif name == 'Marker': - self.marker = self.curr_text - elif name == 'IsTruncated': - self.is_truncated = self.curr_text == 'true' - elif name == 
'Delimiter': - self.delimiter = self.curr_text - elif name == 'MaxKeys': - self.max_keys = int(self.curr_text) - elif name == 'NextMarker': - self.next_marker = self.curr_text - - self.curr_text = '' - - def characters(self, content): - self.curr_text += content - - -class ListAllMyBucketsHandler(xml.sax.ContentHandler): - def __init__(self): - self.entries = [] - self.curr_entry = None - self.curr_text = '' - - def startElement(self, name, attrs): - if name == 'Bucket': - self.curr_entry = Bucket() - - def endElement(self, name): - if name == 'Name': - self.curr_entry.name = self.curr_text - elif name == 'CreationDate': - self.curr_entry.creation_date = self.curr_text - elif name == 'Bucket': - self.entries.append(self.curr_entry) - - def characters(self, content): - self.curr_text = content - - -class LocationHandler(xml.sax.ContentHandler): - def __init__(self): - self.location = None - self.state = 'init' - - def startElement(self, name, attrs): - if self.state == 'init': - if name == 'LocationConstraint': - self.state = 'tag_location' - self.location = '' - else: self.state = 'bad' - else: self.state = 'bad' - - def endElement(self, name): - if self.state == 'tag_location' and name == 'LocationConstraint': - self.state = 'done' - else: self.state = 'bad' - - def characters(self, content): - if self.state == 'tag_location': - self.location += content diff --git a/utils/exception_middleware.py.bak b/utils/exception_middleware.py.bak deleted file mode 100644 index 6d043986e..000000000 --- a/utils/exception_middleware.py.bak +++ /dev/null @@ -1,16 +0,0 @@ -import traceback -import sys -import inspect -from pprint import pprint - -class ConsoleExceptionMiddleware: - def process_exception(self, request, exception): - exc_info = sys.exc_info() - print("######################## Exception #############################") - print('\n'.join(traceback.format_exception(*(exc_info or sys.exc_info())))) - print("----------------------------------------------------------------") - pprint(inspect.trace()[-1][0].f_locals) - print("################################################################") - - #pprint(request) - #print "################################################################" diff --git a/utils/facebook_fetcher.py.bak b/utils/facebook_fetcher.py.bak deleted file mode 100644 index 06bba8a56..000000000 --- a/utils/facebook_fetcher.py.bak +++ /dev/null @@ -1,224 +0,0 @@ -import re -import datetime -import dateutil.parser -from django.conf import settings -from django.utils import feedgenerator -from django.utils.html import linebreaks -from apps.social.models import MSocialServices -from apps.reader.models import UserSubscription -from utils import log as logging -from vendor.facebook import GraphAPIError - -class FacebookFetcher: - - def __init__(self, feed, options=None): - self.feed = feed - self.options = options or {} - - def fetch(self): - page_name = self.extract_page_name() - if not page_name: - return - - facebook_user = self.facebook_user() - if not facebook_user: - return - - # If 'video', use video API to get embed: - # f.get_object('tastyvegetarian', fields='posts') - # f.get_object('1992797300790726', fields='embed_html') - feed = self.fetch_page_feed(facebook_user, page_name, 'name,about,posts,videos,photos') - - data = {} - data['title'] = feed.get('name', "%s on Facebook" % page_name) - data['link'] = feed.get('link', "https://facebook.com/%s" % page_name) - data['description'] = feed.get('about', "%s on Facebook" % page_name) - data['lastBuildDate'] = 
datetime.datetime.utcnow() - data['generator'] = 'NewsBlur Facebook API Decrapifier - %s' % settings.NEWSBLUR_URL - data['docs'] = None - data['feed_url'] = self.feed.feed_address - rss = feedgenerator.Atom1Feed(**data) - merged_data = [] - - posts = feed.get('posts', {}).get('data', None) - if posts: - for post in posts: - story_data = self.page_posts_story(facebook_user, post) - if not story_data: - continue - merged_data.append(story_data) - - videos = feed.get('videos', {}).get('data', None) - if videos: - for video in videos: - story_data = self.page_video_story(facebook_user, video) - if not story_data: - continue - for seen_data in merged_data: - if story_data['link'] == seen_data['link']: - # Video wins over posts (and attachments) - seen_data['description'] = story_data['description'] - seen_data['title'] = story_data['title'] - break - - for story_data in merged_data: - rss.add_item(**story_data) - - return rss.writeString('utf-8') - - def extract_page_name(self): - page = None - try: - page_groups = re.search('facebook.com/(\w+)/?', self.feed.feed_address) - if not page_groups: - return - page = page_groups.group(1) - except IndexError: - return - - return page - - def facebook_user(self): - facebook_api = None - social_services = None - - if self.options.get('requesting_user_id', None): - social_services = MSocialServices.get_user(self.options.get('requesting_user_id')) - facebook_api = social_services.facebook_api() - if not facebook_api: - logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' % - (self.feed.log_title[:30], self.feed.feed_address, self.options)) - return - else: - usersubs = UserSubscription.objects.filter(feed=self.feed) - if not usersubs: - logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No subscriptions' % - (self.feed.log_title[:30], self.feed.feed_address)) - return - - for sub in usersubs: - social_services = MSocialServices.get_user(sub.user_id) - if not social_services.facebook_uid: - continue - - facebook_api = social_services.facebook_api() - if not facebook_api: - continue - else: - break - - if not facebook_api: - logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' % - (self.feed.log_title[:30], self.feed.feed_address, usersubs[0].user.username)) - return - - return facebook_api - - def fetch_page_feed(self, facebook_user, page, fields): - try: - stories = facebook_user.get_object(page, fields=fields) - except GraphAPIError, e: - message = str(e).lower() - if 'session has expired' in message: - logging.debug(u' ***> [%-30s] ~FRFacebook page failed/expired, disconnecting facebook: %s: %s' % - (self.feed.log_title[:30], self.feed.feed_address, e)) - self.feed.save_feed_history(560, "Facebook Error: Expired token") - return {} - - if not stories: - return {} - - return stories - - def page_posts_story(self, facebook_user, page_story): - categories = set() - if 'message' not in page_story: - # Probably a story shared on the page's timeline, not a published story - return - message = linebreaks(page_story['message']) - created_date = page_story['created_time'] - if isinstance(created_date, unicode): - created_date = dateutil.parser.parse(created_date) - fields = facebook_user.get_object(page_story['id'], fields='permalink_url,link,attachments') - permalink = fields.get('link', fields['permalink_url']) - attachments_html = "" - if fields.get('attachments', None) and fields['attachments']['data']: - for attachment in fields['attachments']['data']: - if 'media' in attachment: - 
attachments_html += "" % attachment['media']['image']['src'] - if attachment.get('subattachments', None): - for subattachment in attachment['subattachments']['data']: - attachments_html += "" % subattachment['media']['image']['src'] - - content = """
- %s
- %s
""" % ( - message, - attachments_html - ) - - story = { - 'title': message, - 'link': permalink, - 'description': content, - 'categories': list(categories), - 'unique_id': "fb_post:%s" % page_story['id'], - 'pubdate': created_date, - } - - return story - - def page_video_story(self, facebook_user, page_story): - categories = set() - if 'description' not in page_story: - return - message = linebreaks(page_story['description']) - created_date = page_story['updated_time'] - if isinstance(created_date, unicode): - created_date = dateutil.parser.parse(created_date) - permalink = facebook_user.get_object(page_story['id'], fields='permalink_url')['permalink_url'] - embed_html = facebook_user.get_object(page_story['id'], fields='embed_html') - - if permalink.startswith('/'): - permalink = "https://www.facebook.com%s" % permalink - - content = """
- %s
- %s
""" % ( - message, - embed_html.get('embed_html', '') - ) - - story = { - 'title': page_story.get('story', message), - 'link': permalink, - 'description': content, - 'categories': list(categories), - 'unique_id': "fb_post:%s" % page_story['id'], - 'pubdate': created_date, - } - - return story - - def favicon_url(self): - page_name = self.extract_page_name() - facebook_user = self.facebook_user() - if not facebook_user: - logging.debug(u' ***> [%-30s] ~FRFacebook icon failed, disconnecting facebook: %s' % - (self.feed.log_title[:30], self.feed.feed_address)) - return - - try: - picture_data = facebook_user.get_object(page_name, fields='picture') - except GraphAPIError, e: - message = str(e).lower() - if 'session has expired' in message: - logging.debug(u' ***> [%-30s] ~FRFacebook icon failed/expired, disconnecting facebook: %s: %s' % - (self.feed.log_title[:30], self.feed.feed_address, e)) - return - - if 'picture' in picture_data: - return picture_data['picture']['data']['url'] - \ No newline at end of file diff --git a/utils/feed_fetcher.py.bak b/utils/feed_fetcher.py.bak deleted file mode 100644 index 0935cf3db..000000000 --- a/utils/feed_fetcher.py.bak +++ /dev/null @@ -1,933 +0,0 @@ -import time -import datetime -import traceback -import multiprocessing -import urllib2 -import xml.sax -import redis -import random -import pymongo -import re -import requests -import dateutil.parser -import isodate -import urlparse -from django.conf import settings -from django.db import IntegrityError -from django.core.cache import cache -from apps.reader.models import UserSubscription -from apps.rss_feeds.models import Feed, MStory -from apps.rss_feeds.page_importer import PageImporter -from apps.rss_feeds.icon_importer import IconImporter -from apps.notifications.tasks import QueueNotifications, MUserFeedNotification -from apps.push.models import PushSubscription -from apps.statistics.models import MAnalyticsFetcher, MStatistics -from utils import feedparser -from utils.story_functions import pre_process_story, strip_tags, linkify -from utils import log as logging -from utils.feed_functions import timelimit, TimeoutError -from qurl import qurl -from BeautifulSoup import BeautifulSoup -from django.utils import feedgenerator -from django.utils.html import linebreaks -from django.utils.encoding import smart_unicode -from utils import json_functions as json -from celery.exceptions import SoftTimeLimitExceeded -from utils.twitter_fetcher import TwitterFetcher -from utils.facebook_fetcher import FacebookFetcher -from utils.json_fetcher import JSONFetcher -# from utils.feed_functions import mail_feed_error_to_admin - - -# Refresh feed code adapted from Feedjack. -# http://feedjack.googlecode.com - -FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5) - - -class FetchFeed: - def __init__(self, feed_id, options): - self.feed = Feed.get_by_id(feed_id) - self.options = options - self.fpf = None - self.raw_feed = None - - @timelimit(30) - def fetch(self): - """ - Uses requests to download the feed, parsing it in feedparser. Will be storified later. 
- """ - start = time.time() - identity = self.get_identity() - log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity, - self.feed.log_title[:30], - self.feed.id, - datetime.datetime.now() - self.feed.last_update) - logging.debug(log_msg) - - etag = self.feed.etag - modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None - address = self.feed.feed_address - - if (self.options.get('force') or random.random() <= .01): - self.options['force'] = True - modified = None - etag = None - address = qurl(address, add={"_": random.randint(0, 10000)}) - logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % ( - self.feed.log_title[:30], address)) - elif (not self.feed.fetched_once or not self.feed.known_good): - modified = None - etag = None - - if self.options.get('feed_xml'): - logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % ( - self.feed.log_title[:30], len(self.options.get('feed_xml')))) - - if self.options.get('fpf'): - self.fpf = self.options.get('fpf') - logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % ( - self.feed.log_title[:30])) - return FEED_OK, self.fpf - - if 'youtube.com' in address: - try: - youtube_feed = self.fetch_youtube(address) - except (requests.adapters.ConnectionError): - youtube_feed = None - if not youtube_feed: - logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' % - (self.feed.log_title[:30], address)) - return FEED_ERRHTTP, None - self.fpf = feedparser.parse(youtube_feed) - elif re.match(r'(https?)?://twitter.com/\w+/?', qurl(address, remove=['_'])): - twitter_feed = self.fetch_twitter(address) - if not twitter_feed: - logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' % - (self.feed.log_title[:30], address)) - return FEED_ERRHTTP, None - self.fpf = feedparser.parse(twitter_feed) - elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])): - facebook_feed = self.fetch_facebook() - if not facebook_feed: - logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' % - (self.feed.log_title[:30], address)) - return FEED_ERRHTTP, None - self.fpf = feedparser.parse(facebook_feed) - - if not self.fpf: - try: - headers = self.feed.fetch_headers() - if etag: - headers['If-None-Match'] = etag - if modified: - # format into an RFC 1123-compliant timestamp. We can't use - # time.strftime() since the %a and %b directives can be affected - # by the current locale, but RFC 2616 states that dates must be - # in English. 
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]) - headers['If-Modified-Since'] = modified_header - if etag or modified: - headers['A-IM'] = 'feed' - raw_feed = requests.get(address, headers=headers) - if raw_feed.status_code >= 400: - logging.debug(" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers)) - raw_feed = requests.get(self.feed.feed_address, headers=self.feed.fetch_headers(fake=True)) - - if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""): - # JSON Feed - json_feed = self.fetch_json_feed(address, raw_feed) - if not json_feed: - logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' % - (self.feed.log_title[:30], address)) - return FEED_ERRHTTP, None - self.fpf = feedparser.parse(json_feed) - elif raw_feed.content and raw_feed.status_code < 400: - response_headers = raw_feed.headers - response_headers['Content-Location'] = raw_feed.url - self.raw_feed = smart_unicode(raw_feed.content) - self.fpf = feedparser.parse(self.raw_feed, - response_headers=response_headers) - if self.options.get('debug', False): - logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers)) - except Exception, e: - logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100])) - - if not self.fpf or self.options.get('force_fp', False): - try: - self.fpf = feedparser.parse(address, - agent=self.feed.user_agent, - etag=etag, - modified=modified) - except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: - logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % - (self.feed.log_title[:30], e)) - pass - - if not self.fpf: - try: - logging.debug(u' ***> [%-30s] ~FRTurning off headers...' % - (self.feed.log_title[:30])) - self.fpf = feedparser.parse(address, agent=self.feed.user_agent) - except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: - logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' 
% - (self.feed.log_title[:30], e)) - return FEED_ERRHTTP, None - - logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % ( - self.feed.log_title[:30], time.time() - start)) - - return FEED_OK, self.fpf - - def get_identity(self): - identity = "X" - - current_process = multiprocessing.current_process() - if current_process._identity: - identity = current_process._identity[0] - - return identity - - def fetch_twitter(self, address=None): - twitter_fetcher = TwitterFetcher(self.feed, self.options) - return twitter_fetcher.fetch(address) - - def fetch_facebook(self): - facebook_fetcher = FacebookFetcher(self.feed, self.options) - return facebook_fetcher.fetch() - - def fetch_json_feed(self, address, headers): - json_fetcher = JSONFetcher(self.feed, self.options) - return json_fetcher.fetch(address, headers) - - def fetch_youtube(self, address): - username = None - channel_id = None - list_id = None - - if 'gdata.youtube.com' in address: - try: - username_groups = re.search('gdata.youtube.com/feeds/\w+/users/(\w+)/', address) - if not username_groups: - return - username = username_groups.group(1) - except IndexError: - return - elif 'youtube.com/feeds/videos.xml?user=' in address: - try: - username = urlparse.parse_qs(urlparse.urlparse(address).query)['user'][0] - except IndexError: - return - elif 'youtube.com/feeds/videos.xml?channel_id=' in address: - try: - channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0] - except (IndexError, KeyError): - return - elif 'youtube.com/playlist' in address: - try: - list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0] - except IndexError: - return - elif 'youtube.com/feeds/videos.xml?playlist_id' in address: - try: - list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0] - except IndexError: - return - - if channel_id: - video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id, verify=False) - channel_json = requests.get("https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" % - (channel_id, settings.YOUTUBE_API_KEY)) - channel = json.decode(channel_json.content) - try: - username = channel['items'][0]['snippet']['title'] - description = channel['items'][0]['snippet']['description'] - except (IndexError, KeyError): - return - elif list_id: - playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" % - (list_id, settings.YOUTUBE_API_KEY)) - playlist = json.decode(playlist_json.content) - try: - username = playlist['items'][0]['snippet']['title'] - description = playlist['items'][0]['snippet']['description'] - except (IndexError, KeyError): - return - channel_url = "https://www.youtube.com/playlist?list=%s" % list_id - elif username: - video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username, verify=False) - description = "YouTube videos uploaded by %s" % username - else: - return - - if list_id: - playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" % - (list_id, settings.YOUTUBE_API_KEY)) - playlist = json.decode(playlist_json.content) - try: - video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']] - except (IndexError, KeyError): - return - else: - if video_ids_xml.status_code != 200: - return - video_ids_soup = BeautifulSoup(video_ids_xml.content) - channel_url = video_ids_soup.find('author').find('uri').getText() - video_ids = [] - for 
video_id in video_ids_soup.findAll('yt:videoid'): - video_ids.append(video_id.getText()) - - videos_json = requests.get("https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" % - (','.join(video_ids), settings.YOUTUBE_API_KEY)) - videos = json.decode(videos_json.content) - if 'error' in videos: - logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos)) - return - - data = {} - data['title'] = ("%s's YouTube Videos" % username if 'Uploads' not in username else username) - data['link'] = channel_url - data['description'] = description - data['lastBuildDate'] = datetime.datetime.utcnow() - data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL - data['docs'] = None - data['feed_url'] = address - rss = feedgenerator.Atom1Feed(**data) - - for video in videos['items']: - thumbnail = video['snippet']['thumbnails'].get('maxres') - if not thumbnail: - thumbnail = video['snippet']['thumbnails'].get('high') - if not thumbnail: - thumbnail = video['snippet']['thumbnails'].get('medium') - duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds - if duration_sec >= 3600: - hours = (duration_sec / 3600) - minutes = (duration_sec - (hours*3600)) / 60 - seconds = duration_sec - (hours*3600) - (minutes*60) - duration = "%s:%s:%s" % (hours, '{0:02d}'.format(minutes), '{0:02d}'.format(seconds)) - else: - minutes = duration_sec / 60 - seconds = duration_sec - (minutes*60) - duration = "%s:%s" % ('{0:02d}'.format(minutes), '{0:02d}'.format(seconds)) - content = """
- From: %s
- Duration: %s
- %s
- """ % ( - ("https://www.youtube.com/embed/" + video['id']), - channel_url, username, - duration, - linkify(linebreaks(video['snippet']['description'])), - thumbnail['url'] if thumbnail else "", - ) - - link = "http://www.youtube.com/watch?v=%s" % video['id'] - story_data = { - 'title': video['snippet']['title'], - 'link': link, - 'description': content, - 'author_name': username, - 'categories': [], - 'unique_id': "tag:youtube.com,2008:video:%s" % video['id'], - 'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']), - } - rss.add_item(**story_data) - - return rss.writeString('utf-8') - - -class ProcessFeed: - def __init__(self, feed_id, fpf, options, raw_feed=None): - self.feed_id = feed_id - self.options = options - self.fpf = fpf - self.raw_feed = raw_feed - - def refresh_feed(self): - self.feed = Feed.get_by_id(self.feed_id) - if self.feed_id != self.feed.pk: - logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk)) - self.feed_id = self.feed.pk - - def process(self): - """ Downloads and parses a feed. - """ - start = time.time() - self.refresh_feed() - - ret_values = dict(new=0, updated=0, same=0, error=0) - - if hasattr(self.fpf, 'status'): - if self.options['verbose']: - if self.fpf.bozo and self.fpf.status != 304: - logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % ( - self.feed.log_title[:30], - self.fpf.bozo_exception, - len(self.fpf.entries))) - - if self.fpf.status == 304: - self.feed = self.feed.save() - self.feed.save_feed_history(304, "Not modified") - return FEED_SAME, ret_values - - # 302 and 307: Temporary redirect: ignore - # 301 and 308: Permanent redirect: save it (after 10 tries) - if self.fpf.status == 301 or self.fpf.status == 308: - if self.fpf.href.endswith('feedburner.com/atom.xml'): - return FEED_ERRHTTP, ret_values - redirects, non_redirects = self.feed.count_redirects_in_history('feed') - self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (10-len(redirects))) - if len(redirects) >= 10 or len(non_redirects) == 0: - address = self.fpf.href - if self.options['force'] and address: - address = qurl(address, remove=['_']) - self.feed.feed_address = address - if not self.feed.known_good: - self.feed.fetched_once = True - logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.log_title[:30], self.fpf.status)) - self.feed = self.feed.schedule_feed_fetch_immediately() - if not self.fpf.entries: - self.feed = self.feed.save() - self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") - return FEED_ERRHTTP, ret_values - if self.fpf.status >= 400: - logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.log_title[:30], self.fpf.status)) - fixed_feed = None - if not self.feed.known_good: - fixed_feed, feed = self.feed.check_feed_link_for_feed_address() - if not fixed_feed: - self.feed.save_feed_history(self.fpf.status, "HTTP Error") - else: - self.feed = feed - self.feed = self.feed.save() - return FEED_ERRHTTP, ret_values - - if not self.fpf: - logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.log_title[:30])) - self.feed.save_feed_history(551, "Broken feed") - return FEED_ERRHTTP, ret_values - - if self.fpf and not self.fpf.entries: - if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): - logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." 
% (self.feed.log_title[:30], len(self.fpf.entries))) - fixed_feed = None - if not self.feed.known_good: - fixed_feed, feed = self.feed.check_feed_link_for_feed_address() - if not fixed_feed: - self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception) - else: - self.feed = feed - self.feed = self.feed.save() - return FEED_ERRPARSE, ret_values - elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): - logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries))) - fixed_feed = None - if not self.feed.known_good: - fixed_feed, feed = self.feed.check_feed_link_for_feed_address() - if not fixed_feed: - self.feed.save_feed_history(553, 'Not an RSS feed', self.fpf.bozo_exception) - else: - self.feed = feed - self.feed = self.feed.save() - return FEED_ERRPARSE, ret_values - - # the feed has changed (or it is the first time we parse it) - # saving the etag and last_modified fields - original_etag = self.feed.etag - self.feed.etag = self.fpf.get('etag') - if self.feed.etag: - self.feed.etag = self.feed.etag[:255] - # some times this is None (it never should) *sigh* - if self.feed.etag is None: - self.feed.etag = '' - if self.feed.etag != original_etag: - self.feed.save(update_fields=['etag']) - - original_last_modified = self.feed.last_modified - if hasattr(self.fpf, 'modified') and self.fpf.modified: - try: - self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z') - except Exception, e: - self.feed.last_modified = None - logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e)) - pass - if self.feed.last_modified != original_last_modified: - self.feed.save(update_fields=['last_modified']) - - self.fpf.entries = self.fpf.entries[:100] - - original_title = self.feed.feed_title - if self.fpf.feed.get('title'): - self.feed.feed_title = strip_tags(self.fpf.feed.get('title')) - if self.feed.feed_title != original_title: - self.feed.save(update_fields=['feed_title']) - - tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline) - if tagline: - original_tagline = self.feed.data.feed_tagline - self.feed.data.feed_tagline = smart_unicode(tagline) - if self.feed.data.feed_tagline != original_tagline: - self.feed.data.save(update_fields=['feed_tagline']) - - if not self.feed.feed_link_locked: - new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link - if self.options['force'] and new_feed_link: - new_feed_link = qurl(new_feed_link, remove=['_']) - if new_feed_link != self.feed.feed_link: - logging.debug(" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.log_title[:30], self.feed.feed_link, new_feed_link)) - redirects, non_redirects = self.feed.count_redirects_in_history('page') - self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10-len(redirects))) - if len(redirects) >= 10 or len(non_redirects) == 0: - self.feed.feed_link = new_feed_link - self.feed.save(update_fields=['feed_link']) - - # Determine if stories aren't valid and replace broken guids - guids_seen = set() - permalinks_seen = set() - for entry in self.fpf.entries: - guids_seen.add(entry.get('guid')) - permalinks_seen.add(Feed.get_permalink(entry)) - guid_difference = len(guids_seen) != len(self.fpf.entries) - single_guid = len(guids_seen) == 1 - replace_guids = single_guid and guid_difference - permalink_difference = len(permalinks_seen) != len(self.fpf.entries) - 
single_permalink = len(permalinks_seen) == 1 - replace_permalinks = single_permalink and permalink_difference - - # Compare new stories to existing stories, adding and updating - start_date = datetime.datetime.utcnow() - story_hashes = [] - stories = [] - for entry in self.fpf.entries: - story = pre_process_story(entry, self.fpf.encoding) - if not story['title'] and not story['story_content']: continue - if story.get('published') < start_date: - start_date = story.get('published') - if replace_guids: - if replace_permalinks: - new_story_guid = unicode(story.get('published')) - if self.options['verbose']: - logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % ( - self.feed.log_title[:30], - story.get('guid'), new_story_guid)) - story['guid'] = new_story_guid - else: - new_story_guid = Feed.get_permalink(story) - if self.options['verbose']: - logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % ( - self.feed.log_title[:30], - story.get('guid'), new_story_guid)) - story['guid'] = new_story_guid - story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid')) - stories.append(story) - story_hashes.append(story.get('story_hash')) - - original_story_hash_count = len(story_hashes) - story_hashes_in_unread_cutoff = self.feed.story_hashes_in_unread_cutoff[:original_story_hash_count] - story_hashes.extend(story_hashes_in_unread_cutoff) - story_hashes = list(set(story_hashes)) - if self.options['verbose'] or settings.DEBUG: - logging.debug(u' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % ( - self.feed.log_title[:30], - original_story_hash_count, len(story_hashes)-original_story_hash_count, - len(story_hashes_in_unread_cutoff))) - - - existing_stories = dict((s.story_hash, s) for s in MStory.objects( - story_hash__in=story_hashes, - # story_date__gte=start_date, - # story_feed_id=self.feed.pk - )) - # if len(existing_stories) == 0: - # existing_stories = dict((s.story_hash, s) for s in MStory.objects( - # story_date__gte=start_date, - # story_feed_id=self.feed.pk - # )) - - ret_values = self.feed.add_update_stories(stories, existing_stories, - verbose=self.options['verbose'], - updates_off=self.options['updates_off']) - - # PubSubHubbub - if (hasattr(self.fpf, 'feed') and - hasattr(self.fpf.feed, 'links') and self.fpf.feed.links): - hub_url = None - self_url = self.feed.feed_address - for link in self.fpf.feed.links: - if link['rel'] == 'hub' and not hub_url: - hub_url = link['href'] - elif link['rel'] == 'self': - self_url = link['href'] - push_expired = False - if self.feed.is_push: - try: - push_expired = self.feed.push.lease_expires < datetime.datetime.now() - except PushSubscription.DoesNotExist: - self.feed.is_push = False - if (hub_url and self_url and not settings.DEBUG and - self.feed.active_subscribers > 0 and - (push_expired or not self.feed.is_push or self.options.get('force'))): - logging.debug(u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % ( - self.feed.log_title[:30], - "~SKRe-~SN" if push_expired else "", hub_url)) - try: - PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url) - except TimeoutError: - logging.debug(u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % ( - self.feed.log_title[:30], hub_url)) - elif (self.feed.is_push and - (self.feed.active_subscribers <= 0 or not hub_url)): - logging.debug(u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % ( - self.feed.log_title[:30])) - self.feed.is_push = False - self.feed = self.feed.save() - 
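# Illustrative sketch, not from the deleted module: the block above walks
# self.fpf.feed.links looking for the PubSubHubbub "hub" and "self" links
# before deciding whether to (re)subscribe.  A minimal standalone
# restatement of that discovery step, assuming a feedparser result; the
# function name and fallback argument are illustrative, not original code.
import feedparser

def discover_push_links(parsed, fallback_self_url=None):
    """Return (hub_url, self_url) advertised by a parsed feed, if any."""
    hub_url, self_url = None, fallback_self_url
    for link in parsed.feed.get('links', []):
        if link.get('rel') == 'hub' and not hub_url:
            hub_url = link.get('href')
        elif link.get('rel') == 'self':
            self_url = link.get('href')
    return hub_url, self_url

# e.g. hub_url, self_url = discover_push_links(feedparser.parse(address), address)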
- # Push notifications - if ret_values['new'] > 0 and MUserFeedNotification.feed_has_users(self.feed.pk) > 0: - QueueNotifications.delay(self.feed.pk, ret_values['new']) - - # All Done - logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % ( - self.feed.log_title[:30], - '~FG~SB' if ret_values['new'] else '', ret_values['new'], - '~FY~SB' if ret_values['updated'] else '', ret_values['updated'], - '~SB' if ret_values['same'] else '', ret_values['same'], - '~FR~SB' if ret_values['error'] else '', ret_values['error'], - len(self.fpf.entries))) - self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']), force=self.options['force']) - fetch_date = datetime.datetime.now() - if ret_values['new']: - if not getattr(settings, 'TEST_DEBUG', False): - self.feed.trim_feed() - self.feed.expire_redis() - if MStatistics.get('raw_feed', None) == self.feed.pk: - self.feed.save_raw_feed(self.raw_feed, fetch_date) - self.feed.save_feed_history(200, "OK", date=fetch_date) - - if self.options['verbose']: - logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % ( - self.feed.log_title[:30], time.time() - start)) - - return FEED_OK, ret_values - - -class Dispatcher: - def __init__(self, options, num_threads): - self.options = options - self.feed_stats = { - FEED_OK:0, - FEED_SAME:0, - FEED_ERRPARSE:0, - FEED_ERRHTTP:0, - FEED_ERREXC:0} - self.feed_trans = { - FEED_OK:'ok', - FEED_SAME:'unchanged', - FEED_ERRPARSE:'cant_parse', - FEED_ERRHTTP:'http_error', - FEED_ERREXC:'exception'} - self.feed_keys = sorted(self.feed_trans.keys()) - self.num_threads = num_threads - self.time_start = datetime.datetime.utcnow() - self.workers = [] - - def refresh_feed(self, feed_id): - """Update feed, since it may have changed""" - return Feed.get_by_id(feed_id) - - def process_feed_wrapper(self, feed_queue): - delta = None - current_process = multiprocessing.current_process() - identity = "X" - feed = None - - if current_process._identity: - identity = current_process._identity[0] - - for feed_id in feed_queue: - start_duration = time.time() - feed_fetch_duration = None - feed_process_duration = None - page_duration = None - icon_duration = None - feed_code = None - ret_entries = None - start_time = time.time() - ret_feed = FEED_ERREXC - try: - feed = self.refresh_feed(feed_id) - - skip = False - if self.options.get('fake'): - skip = True - weight = "-" - quick = "-" - rand = "-" - elif (self.options.get('quick') and not self.options['force'] and - feed.known_good and feed.fetched_once and not feed.is_push): - weight = feed.stories_last_month * feed.num_subscribers - random_weight = random.randint(1, max(weight, 1)) - quick = float(self.options.get('quick', 0)) - rand = random.random() - if random_weight < 1000 and rand < quick: - skip = True - elif False and feed.feed_address.startswith("http://news.google.com/news"): - skip = True - weight = "-" - quick = "-" - rand = "-" - if skip: - logging.debug(' ---> [%-30s] ~BGFaking fetch, skipping (%s/month, %s subs, %s < %s)...' 
% ( - feed.log_title[:30], - weight, - feed.num_subscribers, - rand, quick)) - continue - - ffeed = FetchFeed(feed_id, self.options) - ret_feed, fetched_feed = ffeed.fetch() - - feed_fetch_duration = time.time() - start_duration - raw_feed = ffeed.raw_feed - - if ((fetched_feed and ret_feed == FEED_OK) or self.options['force']): - pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed) - ret_feed, ret_entries = pfeed.process() - feed = pfeed.feed - feed_process_duration = time.time() - start_duration - - if (ret_entries and ret_entries['new']) or self.options['force']: - start = time.time() - if not feed.known_good or not feed.fetched_once: - feed.known_good = True - feed.fetched_once = True - feed = feed.save() - if self.options['force'] or random.random() <= 0.02: - logging.debug(' ---> [%-30s] ~FBPerforming feed cleanup...' % (feed.log_title[:30],)) - start_cleanup = time.time() - feed.sync_redis() - logging.debug(' ---> [%-30s] ~FBDone with feed cleanup. Took ~SB%.4s~SN sec.' % (feed.log_title[:30], time.time() - start_cleanup)) - try: - self.count_unreads_for_subscribers(feed) - except TimeoutError: - logging.debug(' ---> [%-30s] Unread count took too long...' % (feed.log_title[:30],)) - if self.options['verbose']: - logging.debug(u' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % ( - feed.log_title[:30], time.time() - start)) - except urllib2.HTTPError, e: - logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (unicode(feed_id)[:30], e.fp.read())) - feed_code = e.code - feed.save_feed_history(feed_code, e.msg, e.fp.read()) - fetched_feed = None - except Feed.DoesNotExist, e: - logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (unicode(feed_id)[:30])) - continue - except SoftTimeLimitExceeded, e: - logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed) - ret_feed = FEED_ERREXC - fetched_feed = None - feed_code = 559 - feed.save_feed_history(feed_code, 'Timeout', e) - except TimeoutError, e: - logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.log_title[:30])) - feed_code = 505 - feed.save_feed_history(feed_code, 'Timeout', e) - fetched_feed = None - except Exception, e: - logging.debug('[%d] ! -------------------------' % (feed_id,)) - tb = traceback.format_exc() - logging.error(tb) - logging.debug('[%d] ! 
-------------------------' % (feed_id,)) - ret_feed = FEED_ERREXC - feed = Feed.get_by_id(getattr(feed, 'pk', feed_id)) - if not feed: continue - feed.save_feed_history(500, "Error", tb) - feed_code = 500 - fetched_feed = None - # mail_feed_error_to_admin(feed, e, local_vars=locals()) - if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and - settings.RAVEN_CLIENT): - settings.RAVEN_CLIENT.captureException() - - if not feed_code: - if ret_feed == FEED_OK: - feed_code = 200 - elif ret_feed == FEED_SAME: - feed_code = 304 - elif ret_feed == FEED_ERRHTTP: - feed_code = 400 - if ret_feed == FEED_ERREXC: - feed_code = 500 - elif ret_feed == FEED_ERRPARSE: - feed_code = 550 - - if not feed: continue - feed = self.refresh_feed(feed.pk) - if not feed: continue - - if ((self.options['force']) or - (random.random() > .9) or - (fetched_feed and - feed.feed_link and - feed.has_page and - (ret_feed == FEED_OK or - (ret_feed == FEED_SAME and feed.stories_last_month > 10)))): - - logging.debug(u' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link)) - page_importer = PageImporter(feed) - try: - page_data = page_importer.fetch_page() - page_duration = time.time() - start_duration - except SoftTimeLimitExceeded, e: - logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed) - page_data = None - feed.save_feed_history(557, 'Timeout', e) - except TimeoutError, e: - logging.debug(' ---> [%-30s] ~FRPage fetch timed out...' % (feed.log_title[:30])) - page_data = None - feed.save_page_history(555, 'Timeout', '') - except Exception, e: - logging.debug('[%d] ! -------------------------' % (feed_id,)) - tb = traceback.format_exc() - logging.error(tb) - logging.debug('[%d] ! -------------------------' % (feed_id,)) - feed.save_page_history(550, "Page Error", tb) - fetched_feed = None - page_data = None - # mail_feed_error_to_admin(feed, e, local_vars=locals()) - if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and - settings.RAVEN_CLIENT): - settings.RAVEN_CLIENT.captureException() - - feed = self.refresh_feed(feed.pk) - logging.debug(u' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link)) - force = self.options['force'] - if random.random() > .99: - force = True - icon_importer = IconImporter(feed, page_data=page_data, force=force) - try: - icon_importer.save() - icon_duration = time.time() - start_duration - except SoftTimeLimitExceeded, e: - logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed) - feed.save_feed_history(558, 'Timeout', e) - except TimeoutError, e: - logging.debug(' ---> [%-30s] ~FRIcon fetch timed out...' % (feed.log_title[:30])) - feed.save_page_history(556, 'Timeout', '') - except Exception, e: - logging.debug('[%d] ! -------------------------' % (feed_id,)) - tb = traceback.format_exc() - logging.error(tb) - logging.debug('[%d] ! 
-------------------------' % (feed_id,)) - # feed.save_feed_history(560, "Icon Error", tb) - # mail_feed_error_to_admin(feed, e, local_vars=locals()) - if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and - settings.RAVEN_CLIENT): - settings.RAVEN_CLIENT.captureException() - else: - logging.debug(u' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s' % (feed.log_title[:30], self.feed_trans[ret_feed], feed.stories_last_month, '' if feed.has_page else ' [HAS NO PAGE]')) - - feed = self.refresh_feed(feed.pk) - delta = time.time() - start_time - - feed.last_load_time = round(delta) - feed.fetched_once = True - try: - feed = feed.save(update_fields=['last_load_time', 'fetched_once']) - except IntegrityError: - logging.debug(" ***> [%-30s] ~FRIntegrityError on feed: %s" % (feed.log_title[:30], feed.feed_address,)) - - if ret_entries and ret_entries['new']: - self.publish_to_subscribers(feed, ret_entries['new']) - - done_msg = (u'%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % ( - identity, feed.log_title[:30], delta, - feed.pk, self.feed_trans[ret_feed],)) - logging.debug(done_msg) - total_duration = time.time() - start_duration - MAnalyticsFetcher.add(feed_id=feed.pk, feed_fetch=feed_fetch_duration, - feed_process=feed_process_duration, - page=page_duration, icon=icon_duration, - total=total_duration, feed_code=feed_code) - - self.feed_stats[ret_feed] += 1 - - if len(feed_queue) == 1: - return feed - - # time_taken = datetime.datetime.utcnow() - self.time_start - - def publish_to_subscribers(self, feed, new_count): - try: - r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL) - listeners_count = r.publish(str(feed.pk), 'story:new_count:%s' % new_count) - if listeners_count: - logging.debug(" ---> [%-30s] ~FMPublished to %s subscribers" % (feed.log_title[:30], listeners_count)) - except redis.ConnectionError: - logging.debug(" ***> [%-30s] ~BMRedis is unavailable for real-time." 
% (feed.log_title[:30],)) - - def count_unreads_for_subscribers(self, feed): - user_subs = UserSubscription.objects.filter(feed=feed, - active=True, - user__profile__last_seen_on__gte=feed.unread_cutoff)\ - .order_by('-last_read_date') - - if not user_subs.count(): - return - - for sub in user_subs: - if not sub.needs_unread_recalc: - sub.needs_unread_recalc = True - sub.save() - - if self.options['compute_scores']: - r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL) - stories = MStory.objects(story_feed_id=feed.pk, - story_date__gte=feed.unread_cutoff) - stories = Feed.format_stories(stories, feed.pk) - story_hashes = r.zrangebyscore('zF:%s' % feed.pk, int(feed.unread_cutoff.strftime('%s')), - int(time.time() + 60*60*24)) - missing_story_hashes = set(story_hashes) - set([s['story_hash'] for s in stories]) - if missing_story_hashes: - missing_stories = MStory.objects(story_feed_id=feed.pk, - story_hash__in=missing_story_hashes)\ - .read_preference(pymongo.ReadPreference.PRIMARY) - missing_stories = Feed.format_stories(missing_stories, feed.pk) - stories = missing_stories + stories - logging.debug(u' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.log_title[:30], len(missing_stories), len(missing_story_hashes), len(stories))) - cache.set("S:%s" % feed.pk, stories, 60) - logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % ( - feed.log_title[:30], len(stories), user_subs.count(), - feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers)) - self.calculate_feed_scores_with_stories(user_subs, stories) - elif self.options.get('mongodb_replication_lag'): - logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % ( - feed.log_title[:30], self.options.get('mongodb_replication_lag'))) - - @timelimit(10) - def calculate_feed_scores_with_stories(self, user_subs, stories): - for sub in user_subs: - silent = False if self.options['verbose'] >= 2 else True - sub.calculate_feed_scores(silent=silent, stories=stories) - - def add_jobs(self, feeds_queue, feeds_count=1): - """ adds a feed processing job to the pool - """ - self.feeds_queue = feeds_queue - self.feeds_count = feeds_count - - def run_jobs(self): - if self.options['single_threaded']: - return self.process_feed_wrapper(self.feeds_queue[0]) - else: - for i in range(self.num_threads): - feed_queue = self.feeds_queue[i] - self.workers.append(multiprocessing.Process(target=self.process_feed_wrapper, - args=(feed_queue,))) - for i in range(self.num_threads): - self.workers[i].start() diff --git a/utils/feed_functions.py.bak b/utils/feed_functions.py.bak deleted file mode 100644 index 2b22a9afa..000000000 --- a/utils/feed_functions.py.bak +++ /dev/null @@ -1,410 +0,0 @@ -import datetime -import threading -import sys -import traceback -import pprint -import urllib.request, urllib.parse, urllib.error -import urllib.parse -import random -import warnings -from django.core.mail import mail_admins -from django.utils.translation import ungettext -from django.utils.encoding import smart_unicode -from utils import log as logging - - -class TimeoutError(Exception): pass -def timelimit(timeout): - """borrowed from web.py""" - def _1(function): - def _2(*args, **kw): - class Dispatch(threading.Thread): - def __init__(self): - threading.Thread.__init__(self) - self.result = None - self.error = None - - self.setDaemon(True) - self.start() - - def run(self): - try: - self.result = 
function(*args, **kw) - except: - self.error = sys.exc_info() - c = Dispatch() - c.join(timeout) - if c.isAlive(): - raise TimeoutError('took too long') - if c.error: - tb = ''.join(traceback.format_exception(c.error[0], c.error[1], c.error[2])) - logging.debug(tb) - mail_admins('Error in timeout: %s' % c.error[0], tb) - raise c.error[0](c.error[1]).with_traceback(c.error[2]) - return c.result - return _2 - return _1 - - -def utf8encode(tstr): - """ Encodes a unicode string in utf-8 - """ - msg = "utf8encode is deprecated. Use django.utils.encoding.smart_unicode instead." - warnings.warn(msg, DeprecationWarning) - return smart_unicode(tstr) - -# From: http://www.poromenos.org/node/87 -def levenshtein_distance(first, second): - """Find the Levenshtein distance between two strings.""" - if len(first) > len(second): - first, second = second, first - if len(second) == 0: - return len(first) - first_length = len(first) + 1 - second_length = len(second) + 1 - distance_matrix = [[0] * second_length for x in range(first_length)] - for i in range(first_length): - distance_matrix[i][0] = i - for j in range(second_length): - distance_matrix[0][j]=j - for i in range(1, first_length): - for j in range(1, second_length): - deletion = distance_matrix[i-1][j] + 1 - insertion = distance_matrix[i][j-1] + 1 - substitution = distance_matrix[i-1][j-1] - if first[i-1] != second[j-1]: - substitution += 1 - distance_matrix[i][j] = min(insertion, deletion, substitution) - return distance_matrix[first_length-1][second_length-1] - -def _do_timesince(d, chunks, now=None): - """ - Started as a copy of django.util.timesince.timesince, but modified to - only output one time unit, and use months as the maximum unit of measure. - - Takes two datetime objects and returns the time between d and now - as a nicely formatted string, e.g. "10 minutes". If d occurs after now, - then "0 minutes" is returned. - - Units used are months, weeks, days, hours, and minutes. - Seconds and microseconds are ignored. 
- """ - # Convert datetime.date to datetime.datetime for comparison - if d.__class__ is not datetime.datetime: - d = datetime.datetime(d.year, d.month, d.day) - - if not now: - now = datetime.datetime.utcnow() - - # ignore microsecond part of 'd' since we removed it from 'now' - delta = now - (d - datetime.timedelta(0, 0, d.microsecond)) - since = delta.days * 24 * 60 * 60 + delta.seconds - if since > 10: - for i, (seconds, name) in enumerate(chunks): - count = since // seconds - if count != 0: - break - s = '%(number)d %(type)s' % {'number': count, 'type': name(count)} - else: - s = 'just a second' - return s - -def relative_timesince(value): - if not value: - return '' - - chunks = ( - (60 * 60 * 24, lambda n: ungettext('day', 'days', n)), - (60 * 60, lambda n: ungettext('hour', 'hours', n)), - (60, lambda n: ungettext('minute', 'minutes', n)), - (1, lambda n: ungettext('second', 'seconds', n)), - (0, lambda n: 'just now'), - ) - return _do_timesince(value, chunks) - -def relative_timeuntil(value): - if not value: - return '' - - chunks = ( - (60 * 60, lambda n: ungettext('hour', 'hours', n)), - (60, lambda n: ungettext('minute', 'minutes', n)) - ) - - now = datetime.datetime.utcnow() - - return _do_timesince(now, chunks, value) - -def seconds_timesince(value): - if not value: - return 0 - now = datetime.datetime.utcnow() - delta = now - value - - return delta.days * 24 * 60 * 60 + delta.seconds - -def format_relative_date(date, future=False): - if not date or date < datetime.datetime(2010, 1, 1): - return "Soon" - - now = datetime.datetime.utcnow() - diff = abs(now - date) - if diff < datetime.timedelta(minutes=60): - minutes = diff.seconds / 60 - return "%s minute%s %s" % (minutes, - '' if minutes == 1 else 's', - '' if future else 'ago') - elif datetime.timedelta(minutes=60) <= diff < datetime.timedelta(minutes=90): - return "1 hour %s" % ('' if future else 'ago') - elif diff < datetime.timedelta(hours=24): - dec = (diff.seconds / 60 + 15) % 60 - if dec >= 30: - return "%s.5 hours %s" % ((((diff.seconds / 60) + 15) / 60), - '' if future else 'ago') - else: - return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60), - '' if future else 'ago') - else: - days = ((diff.seconds / 60) / 60 / 24) - return "%s day%s %s" % (days, '' if days == 1 else 's', '' if future else 'ago') - -def add_object_to_folder(obj, in_folder, folders, parent='', added=False): - obj_identifier = obj - if isinstance(obj, dict): - obj_identifier = list(obj.keys())[0] - - if ((not in_folder or in_folder == " ") and - not parent and - not isinstance(obj, dict) and - obj_identifier not in folders): - folders.append(obj) - return folders - - child_folder_names = [] - for item in folders: - if isinstance(item, dict): - child_folder_names.append(list(item.keys())[0]) - if isinstance(obj, dict) and in_folder.lower() == parent.lower(): - if obj_identifier not in child_folder_names: - folders.append(obj) - return folders - - for k, v in enumerate(folders): - if isinstance(v, dict): - for f_k, f_v in list(v.items()): - if f_k.lower() == in_folder.lower() and obj_identifier not in f_v and not added: - f_v.append(obj) - added = True - folders[k][f_k] = add_object_to_folder(obj, in_folder, f_v, f_k, added) - - return folders - -def mail_feed_error_to_admin(feed, e, local_vars=None, subject=None): - # Mail the admins with the error - if not subject: - subject = "Feed update error" - exc_info = sys.exc_info() - subject = '%s: %s' % (subject, repr(e)) - message = 'Traceback:\n%s\n\Feed:\n%s\nLocals:\n%s' % ( - 
'\n'.join(traceback.format_exception(*exc_info)), - pprint.pformat(feed.__dict__), - pprint.pformat(local_vars) - ) - # print message - mail_admins(subject, message) - -## {{{ http://code.activestate.com/recipes/576611/ (r11) -from operator import itemgetter -from heapq import nlargest -from itertools import repeat - -class Counter(dict): - '''Dict subclass for counting hashable objects. Sometimes called a bag - or multiset. Elements are stored as dictionary keys and their counts - are stored as dictionary values. - - >>> Counter('zyzygy') - Counter({'y': 3, 'z': 2, 'g': 1}) - - ''' - - def __init__(self, iterable=None, **kwds): - '''Create a new, empty Counter object. And if given, count elements - from an input iterable. Or, initialize the count from another mapping - of elements to their counts. - - >>> c = Counter() # a new, empty counter - >>> c = Counter('gallahad') # a new counter from an iterable - >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping - >>> c = Counter(a=4, b=2) # a new counter from keyword args - - ''' - self.update(iterable, **kwds) - - def __missing__(self, key): - return 0 - - def most_common(self, n=None): - '''List the n most common elements and their counts from the most - common to the least. If n is None, then list all element counts. - - >>> Counter('abracadabra').most_common(3) - [('a', 5), ('r', 2), ('b', 2)] - - ''' - if n is None: - return sorted(iter(self.items()), key=itemgetter(1), reverse=True) - return nlargest(n, iter(self.items()), key=itemgetter(1)) - - def elements(self): - '''Iterator over elements repeating each as many times as its count. - - >>> c = Counter('ABCABC') - >>> sorted(c.elements()) - ['A', 'A', 'B', 'B', 'C', 'C'] - - If an element's count has been set to zero or is a negative number, - elements() will ignore it. - - ''' - for elem, count in self.items(): - for _ in repeat(None, count): - yield elem - - # Override dict methods where the meaning changes for Counter objects. - - @classmethod - def fromkeys(cls, iterable, v=None): - raise NotImplementedError( - 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') - - def update(self, iterable=None, **kwds): - '''Like dict.update() but add counts instead of replacing them. - - Source can be an iterable, a dictionary, or another Counter instance. - - >>> c = Counter('which') - >>> c.update('witch') # add elements from another iterable - >>> d = Counter('watch') - >>> c.update(d) # add elements from another counter - >>> c['h'] # four 'h' in which, witch, and watch - 4 - - ''' - if iterable is not None: - if hasattr(iterable, 'iteritems'): - if self: - self_get = self.get - for elem, count in iterable.items(): - self[elem] = self_get(elem, 0) + count - else: - dict.update(self, iterable) # fast path when counter is empty - else: - self_get = self.get - for elem in iterable: - self[elem] = self_get(elem, 0) + 1 - if kwds: - self.update(kwds) - - def copy(self): - 'Like dict.copy() but returns a Counter instance instead of a dict.' - return Counter(self) - - def __delitem__(self, elem): - 'Like dict.__delitem__() but does not raise KeyError for missing values.' 
- if elem in self: - dict.__delitem__(self, elem) - - def __repr__(self): - if not self: - return '%s()' % self.__class__.__name__ - items = ', '.join(map('%r: %r'.__mod__, self.most_common())) - return '%s({%s})' % (self.__class__.__name__, items) - - # Multiset-style mathematical operations discussed in: - # Knuth TAOCP Volume II section 4.6.3 exercise 19 - # and at http://en.wikipedia.org/wiki/Multiset - # - # Outputs guaranteed to only include positive counts. - # - # To strip negative and zero counts, add-in an empty counter: - # c += Counter() - - def __add__(self, other): - '''Add counts from two counters. - - >>> Counter('abbb') + Counter('bcc') - Counter({'b': 4, 'c': 2, 'a': 1}) - - - ''' - if not isinstance(other, Counter): - return NotImplemented - result = Counter() - for elem in set(self) | set(other): - newcount = self[elem] + other[elem] - if newcount > 0: - result[elem] = newcount - return result - - def __sub__(self, other): - ''' Subtract count, but keep only results with positive counts. - - >>> Counter('abbbc') - Counter('bccd') - Counter({'b': 2, 'a': 1}) - - ''' - if not isinstance(other, Counter): - return NotImplemented - result = Counter() - for elem in set(self) | set(other): - newcount = self[elem] - other[elem] - if newcount > 0: - result[elem] = newcount - return result - - def __or__(self, other): - '''Union is the maximum of value in either of the input counters. - - >>> Counter('abbb') | Counter('bcc') - Counter({'b': 3, 'c': 2, 'a': 1}) - - ''' - if not isinstance(other, Counter): - return NotImplemented - _max = max - result = Counter() - for elem in set(self) | set(other): - newcount = _max(self[elem], other[elem]) - if newcount > 0: - result[elem] = newcount - return result - - def __and__(self, other): - ''' Intersection is the minimum of corresponding counts. - - >>> Counter('abbb') & Counter('bcc') - Counter({'b': 1}) - - ''' - if not isinstance(other, Counter): - return NotImplemented - _min = min - result = Counter() - if len(self) < len(other): - self, other = other, self - for elem in filter(self.__contains__, other): - newcount = _min(self[elem], other[elem]) - if newcount > 0: - result[elem] = newcount - return result - - -if __name__ == '__main__': - import doctest - print(doctest.testmod()) -## end of http://code.activestate.com/recipes/576611/ }}} - -def chunks(l, n): - for i in range(0, len(l), n): - yield l[i:i+n] diff --git a/utils/feedfinder.py.bak b/utils/feedfinder.py.bak deleted file mode 100644 index c79f99cdb..000000000 --- a/utils/feedfinder.py.bak +++ /dev/null @@ -1,373 +0,0 @@ -"""feedfinder: Find the Web feed for a Web page -http://www.aaronsw.com/2002/feedfinder/ - -Usage: - feed(uri) - returns feed found for a URI - feeds(uri) - returns all feeds found for a URI - - >>> import feedfinder - >>> feedfinder.feed('scripting.com') - 'http://scripting.com/rss.xml' - >>> - >>> feedfinder.feeds('scripting.com') - ['http://delong.typepad.com/sdj/atom.xml', - 'http://delong.typepad.com/sdj/index.rdf', - 'http://delong.typepad.com/sdj/rss.xml'] - >>> - -Can also use from the command line. Feeds are returned one per line: - - $ python feedfinder.py diveintomark.org - http://diveintomark.org/xml/atom.xml - -How it works: - 0. At every step, feeds are minimally verified to make sure they are really feeds. - 1. If the URI points to a feed, it is simply returned; otherwise - the page is downloaded and the real fun begins. - 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery) - 3. 
links to feeds on the same server ending in ".rss", ".rdf", ".xml", or - ".atom" - 4. links to feeds on the same server containing "rss", "rdf", "xml", or "atom" - 5. links to feeds on external servers ending in ".rss", ".rdf", ".xml", or - ".atom" - 6. links to feeds on external servers containing "rss", "rdf", "xml", or "atom" - 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.). - 8. As a last ditch effort, we search Syndic8 for feeds matching the URI -""" - -__version__ = "1.371" -__date__ = "2006-04-24" -__maintainer__ = "Aaron Swartz (me@aaronsw.com)" -__author__ = "Mark Pilgrim (http://diveintomark.org)" -__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz" -__license__ = "Python" -__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity -Also Jason Diamond, Brian Lalor for bug reporting and patches""" - -_debug = 0 - -import sgmllib, urllib.request, urllib.parse, urllib.error, urllib.parse, re, sys, urllib.robotparser -import requests -from io import StringIO -from lxml import etree - - -# XML-RPC support allows feedfinder to query Syndic8 for possible matches. -# Python 2.3 now comes with this module by default, otherwise you can download it -try: - import xmlrpc.client # http://www.pythonware.com/products/xmlrpc/ -except ImportError: - xmlrpclib = None - -if not dict: - def dict(aList): - rc = {} - for k, v in aList: - rc[k] = v - return rc - -def _debuglog(message): - if _debug: print(message) - -class URLGatekeeper: - """a class to track robots.txt rules across multiple servers""" - def __init__(self): - self.rpcache = {} # a dictionary of RobotFileParser objects, by domain - self.urlopener = urllib.request.FancyURLopener() - self.urlopener.version = "NewsBlur Feed Finder (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)" - _debuglog(self.urlopener.version) - self.urlopener.addheaders = [('User-Agent', self.urlopener.version)] - # self.urlopener.addheaders = [('User-Agent', self.urlopener.version), ('Accept', '*')] - urllib.robotparser.URLopener.version = self.urlopener.version - urllib.robotparser.URLopener.addheaders = self.urlopener.addheaders - - def _getrp(self, url): - protocol, domain = urllib.parse.urlparse(url)[:2] - if domain in self.rpcache: - return self.rpcache[domain] - baseurl = '%s://%s' % (protocol, domain) - robotsurl = urllib.parse.urljoin(baseurl, 'robots.txt') - _debuglog('fetching %s' % robotsurl) - rp = urllib.robotparser.RobotFileParser(robotsurl) - try: - rp.read() - except: - pass - self.rpcache[domain] = rp - return rp - - def can_fetch(self, url): - rp = self._getrp(url) - allow = rp.can_fetch(self.urlopener.version, url) - _debuglog("gatekeeper of %s says %s" % (url, allow)) - return allow - - def get(self, url, check=False): - if check and not self.can_fetch(url): return '' - try: - return requests.get(url, headers=dict(self.urlopener.addheaders)).content - except: - return '' - -_gatekeeper = URLGatekeeper() - -class BaseParser(sgmllib.SGMLParser): - def __init__(self, baseuri): - sgmllib.SGMLParser.__init__(self) - self.links = [] - self.baseuri = baseuri - - def normalize_attrs(self, attrs): - def cleanattr(v): - v = sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v) - if not v: return - v = v.strip() - v = v.replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"').replace('&', '&') - return v - attrs = [(k.lower(), cleanattr(v)) for k, v in attrs if cleanattr(v)] - attrs = [(k, k in 
('rel','type') and v.lower() or v) for k, v in attrs if cleanattr(v)] - return attrs - - def do_base(self, attrs): - attrsD = dict(self.normalize_attrs(attrs)) - if 'href' not in attrsD: return - self.baseuri = attrsD['href'] - - def error(self, *a, **kw): pass # we're not picky - -class LinkParser(BaseParser): - FEED_TYPES = ('application/rss+xml', - 'text/xml', - 'application/atom+xml', - 'application/x.atom+xml', - 'application/x-atom+xml') - def do_link(self, attrs): - attrsD = dict(self.normalize_attrs(attrs)) - if 'rel' not in attrsD: return - rels = attrsD['rel'].split() - if 'alternate' not in rels: return - if attrsD.get('type') not in self.FEED_TYPES: return - if 'href' not in attrsD: return - self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href'])) - -class ALinkParser(BaseParser): - def start_a(self, attrs): - attrsD = dict(self.normalize_attrs(attrs)) - if 'href' not in attrsD: return - self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href'])) - -def makeFullURI(uri): - if not uri: return - uri = uri.strip() - if uri.startswith('feed://'): - uri = 'http://' + uri.split('feed://', 1).pop() - for x in ['http', 'https']: - if uri.startswith('%s://' % x): - return uri - return 'http://%s' % uri - -def getLinks(data, baseuri): - p = LinkParser(baseuri) - p.feed(data) - return p.links - -def getLinksLXML(data, baseuri): - parser = etree.HTMLParser(recover=True) - tree = etree.parse(StringIO(data), parser) - links = [] - for link in tree.findall('.//link'): - if link.attrib.get('type') in LinkParser.FEED_TYPES: - href = link.attrib['href'] - if href: links.append(href) - return links - -def getALinks(data, baseuri): - p = ALinkParser(baseuri) - p.feed(data) - return p.links - -def getLocalLinks(links, baseuri): - found_links = [] - if not baseuri: return found_links - baseuri = baseuri.lower() - for l in links: - try: - if l.lower().startswith(baseuri): - found_links.append(l) - except (AttributeError, UnicodeDecodeError): - pass - return found_links - -def isFeedLink(link): - return link[-4:].lower() in ('.rss', '.rdf', '.xml', '.atom') - -def isXMLRelatedLink(link): - link = link.lower() - return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom') - -r_brokenRedirect = re.compile(']*>(.*?)', re.S) -def tryBrokenRedirect(data): - if ' links that point to feeds - _debuglog('no LINK tags, looking at A tags') - try: - links = getALinks(data, fulluri) - except: - links = [] - _debuglog('no LINK tags, looking at local links') - locallinks = getLocalLinks(links, fulluri) - # look for obvious feed links on the same server - outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, locallinks))))) - if all or not outfeeds: - # look harder for feed links on the same server - outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, locallinks))))) - if all or not outfeeds: - # look for obvious feed links on another server - outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, links))))) - if all or not outfeeds: - # look harder for feed links on another server - outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, links))))) - if all or not outfeeds: - _debuglog('no A tags, guessing') - suffixes = [ # filenames used by popular software: - 'feed/', # obvious - 'atom.xml', # blogger, TypePad - 'index.atom', # MT, apparently - 'index.rdf', # MT - 'rss.xml', # Dave Winer/Manila - 'index.xml', # MT - 'index.rss' # Slash - ] - outfeeds.extend(list(filter(isFeed, [urllib.parse.urljoin(fulluri, x) for x in suffixes]))) 
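# Illustrative sketch, not from the deleted module: the suffix list above is
# the "guess well-known feed filenames" step (step 7 in the module
# docstring), restated on its own.  `is_feed` stands in for the module's
# verifier and the function name is illustrative.
import urllib.parse

COMMON_FEED_PATHS = ['feed/', 'atom.xml', 'index.atom', 'index.rdf',
                     'rss.xml', 'index.xml', 'index.rss']

def guess_feed_urls(base_uri, is_feed):
    """Join each well-known filename onto the page URI and keep real feeds."""
    candidates = (urllib.parse.urljoin(base_uri, path) for path in COMMON_FEED_PATHS)
    return [url for url in candidates if is_feed(url)]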
- if (all or not outfeeds) and querySyndic8: - # still no luck, search Syndic8 for feeds (requires xmlrpclib) - _debuglog('still no luck, searching Syndic8') - outfeeds.extend(getFeedsFromSyndic8(uri)) - if hasattr(__builtins__, 'set') or 'set' in __builtins__: - outfeeds = list(set(outfeeds)) - return outfeeds - -getFeeds = feeds # backwards-compatibility - -def feed(uri): - #todo: give preference to certain feed formats - feedlist = feeds(uri) - if feedlist: - feeds_no_comments = [f for f in feedlist if 'comments' not in f.lower()] - if feeds_no_comments: - return feeds_no_comments[0] - return feedlist[0] - else: - return None - -##### test harness ###### - -def test(): - uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html' - failed = [] - count = 0 - while 1: - data = _gatekeeper.get(uri) - if data.find('Atom autodiscovery test') == -1: break - sys.stdout.write('.') - sys.stdout.flush() - count += 1 - links = getLinks(data, uri) - if not links: - print('\n*** FAILED ***', uri, 'could not find link') - failed.append(uri) - elif len(links) > 1: - print('\n*** FAILED ***', uri, 'found too many links') - failed.append(uri) - else: - atomdata = urllib.request.urlopen(links[0]).read() - if atomdata.find(' tags. - logging.info("Looking for tags.") - try: - tree = BeautifulSoup(feed_text) - except ValueError: - return [] - links = [] - for link in tree.findAll("link"): - if link.get("type") in ["application/rss+xml", - "text/xml", - "application/atom+xml", - "application/x.atom+xml", - "application/x-atom+xml", - "application/json"]: - links.append(urlparse.urljoin(url, link.get("href", ""))) - - # Check the detected links. - urls = list(filter(finder.is_feed, links)) - logging.info("Found {0} feed tags.".format(len(urls))) - if len(urls) and not check_all: - return sort_urls(urls) - - # Look for tags. - logging.info("Looking for tags.") - local, remote = [], [] - for a in tree.findAll("a"): - href = a.get("href", None) - if href is None: - continue - if "://" not in href and finder.is_feed_url(href): - local.append(href) - if finder.is_feedlike_url(href): - remote.append(href) - - # Check the local URLs. - local = [urlparse.urljoin(url, l) for l in local] - urls += list(filter(finder.is_feed, local)) - logging.info("Found {0} local links to feeds.".format(len(urls))) - if len(urls) and not check_all: - return sort_urls(urls) - - # Check the remote URLs. - remote = [urlparse.urljoin(url, l) for l in remote] - urls += list(filter(finder.is_feed, remote)) - logging.info("Found {0} remote links to feeds.".format(len(urls))) - if len(urls) and not check_all: - return sort_urls(urls) - - # Guessing potential URLs. 
- fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml", - "index.rss", "index.json"] - urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f) - for f in fns])) - return sort_urls(urls) - - -def url_feed_prob(url): - if "comments" in url: - return -2 - if "georss" in url: - return -1 - kw = ["atom", "rss", "rdf", ".xml", "feed", "json"] - for p, t in zip(list(range(len(kw), 0, -1)), kw): - if t in url: - return p - return 0 - - -def sort_urls(feeds): - return sorted(list(set(feeds)), key=url_feed_prob, reverse=True) - - -if __name__ == "__main__": - print(find_feeds("www.preposterousuniverse.com/blog/")) - print(find_feeds("http://xkcd.com")) - print(find_feeds("dan.iel.fm/atom.xml")) - print(find_feeds("dan.iel.fm", check_all=True)) - print(find_feeds("kapadia.github.io")) - print(find_feeds("blog.jonathansick.ca")) - print(find_feeds("asdasd")) \ No newline at end of file diff --git a/utils/json_functions.py.bak b/utils/json_functions.py.bak deleted file mode 100644 index 699c930c7..000000000 --- a/utils/json_functions.py.bak +++ /dev/null @@ -1,180 +0,0 @@ -#-*- coding: utf-8 -*- -from django.db import models -from django.utils.functional import Promise -from django.utils.encoding import force_unicode, smart_unicode -import json -from decimal import Decimal -from django.core import serializers -from django.conf import settings -from django.http import HttpResponse, HttpResponseForbidden, Http404 -from django.core.mail import mail_admins -from django.db.models.query import QuerySet -from mongoengine.queryset.queryset import QuerySet as MongoQuerySet -from bson.objectid import ObjectId -import sys -import datetime - - -def decode(data): - if not data: - return data - return json.loads(data) - - -def encode(data, *args, **kwargs): - if type(data) == QuerySet: # Careful, ValuesQuerySet is a dict - # Django models - return serializers.serialize("json", data, *args, **kwargs) - else: - return json_encode(data, *args, **kwargs) - - -def json_encode(data, *args, **kwargs): - """ - The main issues with django's default json serializer is that properties that - had been added to an object dynamically are being ignored (and it also has - problems with some models). - """ - - def _any(data): - ret = None - # Opps, we used to check if it is of type list, but that fails - # i.e. in the case of django.newforms.utils.ErrorList, which extends - # the type "list". Oh man, that was a dumb mistake! - if hasattr(data, 'canonical'): - ret = _any(data.canonical()) - elif isinstance(data, list): - ret = _list(data) - elif isinstance(data, set): - ret = _list(list(data)) - # Same as for lists above. - elif isinstance(data, dict): - ret = _dict(data) - elif isinstance(data, (Decimal, ObjectId)): - # json.dumps() cant handle Decimal - ret = str(data) - elif isinstance(data, models.query.QuerySet): - # Actually its the same as a list ... - ret = _list(data) - elif isinstance(data, MongoQuerySet): - # Actually its the same as a list ... 
- ret = _list(data) - elif isinstance(data, models.Model): - ret = _model(data) - # here we need to encode the string as unicode (otherwise we get utf-16 in the json-response) - elif isinstance(data, basestring): - ret = smart_unicode(data) - elif isinstance(data, Exception): - ret = unicode(data) - # see http://code.djangoproject.com/ticket/5868 - elif isinstance(data, Promise): - ret = force_unicode(data) - elif isinstance(data, datetime.datetime) or isinstance(data, datetime.date): - ret = str(data) - elif hasattr(data, 'to_json'): - ret = data.to_json() - else: - ret = data - return ret - - def _model(data): - ret = {} - # If we only have a model, we only want to encode the fields. - for f in data._meta.fields: - ret[f.attname] = _any(getattr(data, f.attname)) - # And additionally encode arbitrary properties that had been added. - fields = dir(data.__class__) + ret.keys() - add_ons = [k for k in dir(data) if k not in fields] - for k in add_ons: - ret[k] = _any(getattr(data, k)) - return ret - - def _list(data): - ret = [] - for v in data: - ret.append(_any(v)) - return ret - - def _dict(data): - ret = {} - for k, v in data.items(): - ret[str(k)] = _any(v) - return ret - - if hasattr(data, 'to_json'): - data = data.to_json() - ret = _any(data) - return json.dumps(ret) - - -def json_view(func): - def wrap(request, *a, **kw): - response = func(request, *a, **kw) - return json_response(request, response) - - if isinstance(func, HttpResponse): - return func - else: - return wrap - - -def json_response(request, response=None): - code = 200 - - if isinstance(response, HttpResponseForbidden): - return response - - try: - if isinstance(response, dict): - response = dict(response) - if 'result' not in response: - response['result'] = 'ok' - authenticated = request.user.is_authenticated - response['authenticated'] = authenticated - if authenticated: - response['user_id'] = request.user.pk - except KeyboardInterrupt: - # Allow keyboard interrupts through for debugging. 
- raise - except Http404: - raise Http404 - except Exception as e: - # Mail the admins with the error - exc_info = sys.exc_info() - subject = 'JSON view error: %s' % request.path - try: - request_repr = repr(request) - except: - request_repr = 'Request repr() unavailable' - import traceback - message = 'Traceback:\n%s\n\nRequest:\n%s' % ( - '\n'.join(traceback.format_exception(*exc_info)), - request_repr, - ) - - response = {'result': 'error', - 'text': unicode(e)} - code = 500 - if not settings.DEBUG: - mail_admins(subject, message, fail_silently=True) - else: - print '\n'.join(traceback.format_exception(*exc_info)) - - json = json_encode(response) - return HttpResponse(json, content_type='application/json', status=code) - - -def main(): - test = { - 1: True, - 2: u"string", - 3: 30, - 4: u"юнікод, ўўў, © ™ ® ё ² § $ ° ќо́", - 5: "utf-8: \xd1\x9e, \xc2\xa9 \xe2\x84\xa2 \xc2\xae \xd1\x91 \xd0\xba\xcc\x81\xd0\xbe\xcc\x81", - } - json_test = json_encode(test) - print test, json_test - - -if __name__ == '__main__': - main() diff --git a/utils/s3_utils.py.bak b/utils/s3_utils.py.bak deleted file mode 100644 index 8be10fb90..000000000 --- a/utils/s3_utils.py.bak +++ /dev/null @@ -1,148 +0,0 @@ -import os -import sys -import time -import mimetypes -from boto.s3.connection import S3Connection -from boto.s3.key import Key -from utils.image_functions import ImageOps - -if '/srv/newsblur' not in ' '.join(sys.path): - sys.path.append("/srv/newsblur") - -os.environ['DJANGO_SETTINGS_MODULE'] = 'settings' -from django.conf import settings - -ACCESS_KEY = settings.S3_ACCESS_KEY -SECRET = settings.S3_SECRET -BUCKET_NAME = settings.S3_BACKUP_BUCKET # Note that you need to create this bucket first - -import ssl - -_old_match_hostname = ssl.match_hostname - -def _new_match_hostname(cert, hostname): - if hostname.endswith('.s3.amazonaws.com'): - pos = hostname.find('.s3.amazonaws.com') - hostname = hostname[:pos].replace('.', '') + hostname[pos:] - return _old_match_hostname(cert, hostname) - -ssl.match_hostname = _new_match_hostname - -def save_file_in_s3(filename): - conn = S3Connection(ACCESS_KEY, SECRET) - bucket = conn.get_bucket(BUCKET_NAME) - k = Key(bucket) - k.key = filename - - k.set_contents_from_filename(filename) - -def get_file_from_s3(filename): - conn = S3Connection(ACCESS_KEY, SECRET) - bucket = conn.get_bucket(BUCKET_NAME) - k = Key(bucket) - k.key = filename - - k.get_contents_to_filename(filename) - -def list_backup_in_s3(): - conn = S3Connection(ACCESS_KEY, SECRET) - bucket = conn.get_bucket(BUCKET_NAME) - - for i, key in enumerate(bucket.get_all_keys()): - print "[%s] %s" % (i, key.name) - -def delete_all_backups(): - #FIXME: validate filename exists - conn = S3Connection(ACCESS_KEY, SECRET) - bucket = conn.get_bucket(BUCKET_NAME) - - for i, key in enumerate(bucket.get_all_keys()): - print "deleting %s" % (key.name) - key.delete() - -if __name__ == '__main__': - import sys - if len(sys.argv) < 3: - print 'Usage: %s ' % (sys.argv[0]) - else: - if sys.argv[1] == 'set': - save_file_in_s3(sys.argv[2]) - elif sys.argv[1] == 'get': - get_file_from_s3(sys.argv[2]) - elif sys.argv[1] == 'list': - list_backup_in_s3() - elif sys.argv[1] == 'delete': - delete_all_backups() - else: - print 'Usage: %s ' % (sys.argv[0]) - - -class S3Store: - - def __init__(self, bucket_name=settings.S3_AVATARS_BUCKET_NAME): - if settings.DEBUG: - import ssl - - try: - _create_unverified_https_context = ssl._create_unverified_context - except AttributeError: - # Legacy Python that doesn't verify HTTPS certificates by 
default - pass - else: - # Handle target environment that doesn't support HTTPS verification - ssl._create_default_https_context = _create_unverified_https_context - - self.s3 = S3Connection(ACCESS_KEY, SECRET) - self.bucket = self.create_bucket(bucket_name) - - def create_bucket(self, bucket_name): - return self.s3.create_bucket(bucket_name) - - def save_profile_picture(self, user_id, filename, image_body): - content_type, extension = self._extract_content_type(filename) - if not content_type or not extension: - return - - image_name = 'profile_%s.%s' % (int(time.time()), extension) - - image = ImageOps.resize_image(image_body, 'fullsize', fit_to_size=False) - if image: - key = 'avatars/%s/large_%s' % (user_id, image_name) - self._save_object(key, image, content_type=content_type) - - image = ImageOps.resize_image(image_body, 'thumbnail', fit_to_size=True) - if image: - key = 'avatars/%s/thumbnail_%s' % (user_id, image_name) - self._save_object(key, image, content_type=content_type) - - return image and image_name - - def _extract_content_type(self, filename): - content_type = mimetypes.guess_type(filename)[0] - extension = None - - if content_type == 'image/jpeg': - extension = 'jpg' - elif content_type == 'image/png': - extension = 'png' - elif content_type == 'image/gif': - extension = 'gif' - - return content_type, extension - - def _make_key(self): - return Key(bucket=self.bucket) - - def _save_object(self, key, file_object, content_type=None): - k = self._make_key() - k.key = key - file_object.seek(0) - - if content_type: - k.set_contents_from_file(file_object, headers={ - 'Content-Type': content_type, - }) - else: - k.set_contents_from_file(file_object) - k.set_acl('public-read') -
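
The S3Store class deleted above resizes an uploaded avatar into a full-size and a thumbnail variant and pushes both to the avatars bucket as public-read keys. A hedged usage sketch follows, assuming the live (non-.bak) module keeps the same API and that Django settings and boto credentials are configured; the user id and filename are illustrative.

from utils.s3_utils import S3Store

store = S3Store()  # defaults to settings.S3_AVATARS_BUCKET_NAME

with open('avatar.png', 'rb') as image_file:
    image_name = store.save_profile_picture(42, 'avatar.png', image_file)

if image_name:
    # Keys are written as avatars/42/large_<image_name> and
    # avatars/42/thumbnail_<image_name>, both with a public-read ACL.
    print('Saved avatar as %s' % image_name)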