From 7972d0bb5c4f9dac61064bb05fe6586c37820061 Mon Sep 17 00:00:00 2001 From: Samuel Clay Date: Sat, 26 Nov 2011 02:16:14 -0500 Subject: [PATCH] Revert "Switching to requests from urllib2/httplib. Bring on the page errors." This reverts commit 617dc3bb56c6962387cc8d2a7501beb63e8fab29. --- .../management/commands/refresh_feeds.py | 5 +- apps/rss_feeds/page_importer.py | 47 ++++++------- local_settings.py.template | 68 +++++++------------ settings.py | 18 ++--- utils/feed_fetcher.py | 3 +- 5 files changed, 52 insertions(+), 89 deletions(-) diff --git a/apps/rss_feeds/management/commands/refresh_feeds.py b/apps/rss_feeds/management/commands/refresh_feeds.py index 8c4732963..db88c45fe 100644 --- a/apps/rss_feeds/management/commands/refresh_feeds.py +++ b/apps/rss_feeds/management/commands/refresh_feeds.py @@ -5,7 +5,6 @@ from apps.rss_feeds.models import Feed from optparse import make_option from utils import feed_fetcher from utils.management_functions import daemonize -import django import socket import datetime import redis @@ -76,14 +75,12 @@ class Command(BaseCommand): feeds_queue = [] for _ in range(num_workers): feeds_queue.append([]) - + i = 0 for feed in feeds: feeds_queue[i%num_workers].append(feed.pk) i += 1 disp.add_jobs(feeds_queue, i) - django.db.connection.close() - print " ---> Fetching %s feeds..." % feeds.count() disp.run_jobs() diff --git a/apps/rss_feeds/page_importer.py b/apps/rss_feeds/page_importer.py index 1e653eef2..9e48a4a05 100644 --- a/apps/rss_feeds/page_importer.py +++ b/apps/rss_feeds/page_importer.py @@ -1,14 +1,18 @@ -import requests +import urllib2, httplib import re import urlparse import traceback import feedparser import time -from django.conf import settings from utils import log as logging from apps.rss_feeds.models import MFeedPage from utils.feed_functions import timelimit, mail_feed_error_to_admin +HEADERS = { + 'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com', + 'Connection': 'close', +} + BROKEN_PAGES = [ 'tag:', 'info:', @@ -22,18 +26,6 @@ class PageImporter(object): def __init__(self, url, feed): self.url = url self.feed = feed - self.setup_headers() - - def setup_headers(self): - s = requests.session() - s.config['keep_alive'] = False - self.headers = { - 'User-Agent': 'NewsBlur Page Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % ( - self.feed.num_subscribers, - 's' if self.feed.num_subscribers != 1 else '', - settings.NEWSBLUR_URL - ), - } @timelimit(15) def fetch_page(self): @@ -43,9 +35,10 @@ class PageImporter(object): try: if self.url.startswith('http'): - response = requests.get(self.url, headers=self.headers) + request = urllib2.Request(self.url, headers=HEADERS) + response = urllib2.urlopen(request) time.sleep(0.01) # Grrr, GIL. 
- data = response.content + data = response.read() elif any(self.url.startswith(s) for s in BROKEN_PAGES): self.save_no_page() return @@ -53,17 +46,17 @@ class PageImporter(object): data = open(self.url, 'r').read() html = self.rewrite_page(data) self.save_page(html) - # except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e: - # self.feed.save_page_history(401, "Bad URL", e) - # fp = feedparser.parse(self.feed.feed_address) - # self.feed.feed_link = fp.feed.get('link', "") - # self.feed.save() - # except (urllib2.HTTPError), e: - # self.feed.save_page_history(e.code, e.msg, e.fp.read()) - # return - # except (httplib.IncompleteRead), e: - # self.feed.save_page_history(500, "IncompleteRead", e) - # return + except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e: + self.feed.save_page_history(401, "Bad URL", e) + fp = feedparser.parse(self.feed.feed_address) + self.feed.feed_link = fp.feed.get('link', "") + self.feed.save() + except (urllib2.HTTPError), e: + self.feed.save_page_history(e.code, e.msg, e.fp.read()) + return + except (httplib.IncompleteRead), e: + self.feed.save_page_history(500, "IncompleteRead", e) + return except Exception, e: logging.debug('[%d] ! -------------------------' % (self.feed.id,)) tb = traceback.format_exc() diff --git a/local_settings.py.template b/local_settings.py.template index 674103555..2970894bd 100644 --- a/local_settings.py.template +++ b/local_settings.py.template @@ -1,46 +1,5 @@ import logging -# =================== -# = Server Settings = -# =================== - -ADMINS = ( - ('Samuel Clay', 'samuel@ofbrooklyn.com'), -) - -SERVER_EMAIL = 'server@newsblur.com' -HELLO_EMAIL = 'hello@newsblur.com' -NEWSBLUR_URL = 'http://www.newsblur.com' - -# ================== -# = Global Settngs = -# ================== - -DEBUG = True -MEDIA_URL = '/media/' -SECRET_KEY = 'YOUR SECRET KEY' - -CACHE_BACKEND = 'dummy:///' -# CACHE_BACKEND = 'locmem:///' -# CACHE_BACKEND = 'memcached://127.0.0.1:11211' - -EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' - -# Set this to the username that is shown on the homepage to unauthenticated users. -HOMEPAGE_USERNAME = 'conesus' - -# Google Reader OAuth API Keys -OAUTH_KEY = 'www.example.com' -OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE' - -S3_ACCESS_KEY = 'XXX' -S3_SECRET = 'SECRET' -S3_BACKUP_BUCKET = 'newsblur_backups' - -# ============= -# = Databases = -# ============= - DATABASES = { 'default': { 'NAME': 'newsblur', @@ -61,13 +20,28 @@ MONGODB_SLAVE = { 'host': '127.0.0.1' } +DEBUG = True + +MEDIA_URL = '/media/' + +SECRET_KEY = 'YOUR SECRET KEY' + +CACHE_BACKEND = 'dummy:///' +# CACHE_BACKEND = 'locmem:///' +# CACHE_BACKEND = 'memcached://127.0.0.1:11211' + +EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' + +# Set this to the username that is shown on the homepage to unauthenticated users. 
+HOMEPAGE_USERNAME = 'conesus' + +# Google Reader OAuth API Keys +OAUTH_KEY = 'www.example.com' +OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE' + # Celery RabbitMQ Broker BROKER_HOST = "127.0.0.1" -# =========== -# = Logging = -# =========== - # Logging (setup for development) LOG_TO_STREAM = True @@ -77,3 +51,7 @@ if len(logging._handlerList) < 1: format='%(asctime)-12s: %(message)s', datefmt='%b %d %H:%M:%S', handler=logging.StreamHandler) + +S3_ACCESS_KEY = 'XXX' +S3_SECRET = 'SECRET' +S3_BACKUP_BUCKET = 'newsblur_backups' diff --git a/settings.py b/settings.py index 8d8d77dcc..e31f65e4d 100644 --- a/settings.py +++ b/settings.py @@ -4,18 +4,6 @@ import os from mongoengine import connect import redis -# =================== -# = Server Settings = -# =================== - -ADMINS = ( - ('Samuel Clay', 'samuel@ofbrooklyn.com'), -) - -SERVER_EMAIL = 'server@newsblur.com' -HELLO_EMAIL = 'hello@newsblur.com' -NEWSBLUR_URL = 'http://www.newsblur.com' - # =========================== # = Directory Declaractions = # =========================== @@ -37,10 +25,14 @@ if '/utils' not in ' '.join(sys.path): sys.path.append(UTILS_ROOT) if '/vendor' not in ' '.join(sys.path): sys.path.append(VENDOR_ROOT) + # =================== # = Global Settings = # =================== +ADMINS = ( + ('Samuel Clay', 'samuel@ofbrooklyn.com'), +) TEST_DEBUG = False SEND_BROKEN_LINK_EMAILS = False MANAGERS = ADMINS @@ -313,6 +305,8 @@ SESSION_ENGINE = "django.contrib.sessions.backends.db" TEST_RUNNER = "utils.testrunner.TestRunner" SESSION_COOKIE_NAME = 'newsblur_sessionid' SESSION_COOKIE_AGE = 60*60*24*365*2 # 2 years +SERVER_EMAIL = 'server@newsblur.com' +HELLO_EMAIL = 'hello@newsblur.com' # =========== # = Logging = diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index 6956891ee..9441336f4 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -23,6 +23,7 @@ import redis # Refresh feed code adapted from Feedjack. # http://feedjack.googlecode.com +URL = 'http://www.newsblur.com/' SLOWFEED_WARNING = 10 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4) FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5) @@ -62,7 +63,7 @@ class FetchFeed: USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', - settings.NEWSBLUR_URL + URL ) self.fpf = feedparser.parse(self.feed.feed_address,
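
Note on the refresh_feeds.py hunk: the worker queues are filled round-robin before being handed to the dispatcher, and the counter i doubles as the total job count passed to disp.add_jobs(). A minimal standalone sketch of that distribution, with a placeholder worker count and placeholder feed primary keys (not values from the patch):

    num_workers = 4
    feed_pks = [101, 102, 103, 104, 105]  # placeholder feed primary keys

    feeds_queue = [[] for _ in range(num_workers)]
    i = 0
    for pk in feed_pks:
        feeds_queue[i % num_workers].append(pk)
        i += 1

    # feeds_queue is now [[101, 105], [102], [103], [104]] and i is 5,
    # the job count the patch hands to disp.add_jobs(feeds_queue, i).
    print feeds_queue, i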
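
Note on the apps/rss_feeds/page_importer.py hunk: the revert restores the urllib2/httplib fetch path with the module-level HEADERS dict and explicit exception handling. Below is a minimal self-contained sketch of that path (Python 2; the fetch() helper and the example.com URL are illustrative, not names from the patch). One subtlety: urllib2.HTTPError is a subclass of urllib2.URLError, so in the restored code the URLError clause also swallows HTTP errors and the separate HTTPError clause is unreachable; the sketch orders the handlers so each one can fire.

    import urllib2, httplib

    HEADERS = {
        'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
        'Connection': 'close',
    }

    def fetch(url):
        # Returns the page body, or None if any fetch error occurred.
        try:
            request = urllib2.Request(url, headers=HEADERS)
            response = urllib2.urlopen(request)
            return response.read()
        except urllib2.HTTPError, e:
            # Caught before URLError because HTTPError subclasses it; the
            # patch records e.code, e.msg, and the body in page history.
            print "HTTP error %s: %s" % (e.code, e.msg)
        except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
            # The patch logs these as a 401 "Bad URL" and refreshes feed_link.
            print "Bad URL: %s" % e
        except httplib.IncompleteRead, e:
            # The patch records this as a 500 "IncompleteRead".
            print "Incomplete read: %s" % e
        return None

    if __name__ == '__main__':
        print fetch('http://www.example.com/')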
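
Note on the utils/feed_fetcher.py hunk: the reverted commit sourced the site URL from settings.NEWSBLUR_URL; the revert restores a hardcoded module-level URL constant inside the fetcher's user-agent string. A sketch of that string construction, where build_user_agent() is an illustrative helper (the patch builds the string inline in FetchFeed):

    URL = 'http://www.newsblur.com/'

    def build_user_agent(num_subscribers):
        # Pluralizes "subscriber" and embeds the hardcoded site URL.
        return 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
            num_subscribers,
            's' if num_subscribers != 1 else '',
            URL,
        )

    print build_user_agent(1)   # NewsBlur Feed Fetcher (1 subscriber) - http://www.newsblur.com/
    print build_user_agent(12)  # NewsBlur Feed Fetcher (12 subscribers) - http://www.newsblur.com/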