diff --git a/apps/rss_feeds/management/commands/refresh_feeds.py b/apps/rss_feeds/management/commands/refresh_feeds.py index db88c45fe..8c4732963 100644 --- a/apps/rss_feeds/management/commands/refresh_feeds.py +++ b/apps/rss_feeds/management/commands/refresh_feeds.py @@ -5,6 +5,7 @@ from apps.rss_feeds.models import Feed from optparse import make_option from utils import feed_fetcher from utils.management_functions import daemonize +import django import socket import datetime import redis @@ -75,12 +76,14 @@ class Command(BaseCommand): feeds_queue = [] for _ in range(num_workers): feeds_queue.append([]) - + i = 0 for feed in feeds: feeds_queue[i%num_workers].append(feed.pk) i += 1 disp.add_jobs(feeds_queue, i) + django.db.connection.close() + print " ---> Fetching %s feeds..." % feeds.count() disp.run_jobs() diff --git a/apps/rss_feeds/page_importer.py b/apps/rss_feeds/page_importer.py index 9e48a4a05..1e653eef2 100644 --- a/apps/rss_feeds/page_importer.py +++ b/apps/rss_feeds/page_importer.py @@ -1,18 +1,14 @@ -import urllib2, httplib +import requests import re import urlparse import traceback import feedparser import time +from django.conf import settings from utils import log as logging from apps.rss_feeds.models import MFeedPage from utils.feed_functions import timelimit, mail_feed_error_to_admin -HEADERS = { - 'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com', - 'Connection': 'close', -} - BROKEN_PAGES = [ 'tag:', 'info:', @@ -26,6 +22,18 @@ class PageImporter(object): def __init__(self, url, feed): self.url = url self.feed = feed + self.setup_headers() + + def setup_headers(self): + s = requests.session() + s.config['keep_alive'] = False + self.headers = { + 'User-Agent': 'NewsBlur Page Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % ( + self.feed.num_subscribers, + 's' if self.feed.num_subscribers != 1 else '', + settings.NEWSBLUR_URL + ), + 
} @timelimit(15) def fetch_page(self): @@ -35,10 +43,9 @@ class PageImporter(object): try: if self.url.startswith('http'): - request = urllib2.Request(self.url, headers=HEADERS) - response = urllib2.urlopen(request) + response = requests.get(self.url, headers=self.headers) time.sleep(0.01) # Grrr, GIL. - data = response.read() + data = response.content elif any(self.url.startswith(s) for s in BROKEN_PAGES): self.save_no_page() return @@ -46,17 +53,17 @@ class PageImporter(object): data = open(self.url, 'r').read() html = self.rewrite_page(data) self.save_page(html) - except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e: - self.feed.save_page_history(401, "Bad URL", e) - fp = feedparser.parse(self.feed.feed_address) - self.feed.feed_link = fp.feed.get('link', "") - self.feed.save() - except (urllib2.HTTPError), e: - self.feed.save_page_history(e.code, e.msg, e.fp.read()) - return - except (httplib.IncompleteRead), e: - self.feed.save_page_history(500, "IncompleteRead", e) - return + # except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e: + # self.feed.save_page_history(401, "Bad URL", e) + # fp = feedparser.parse(self.feed.feed_address) + # self.feed.feed_link = fp.feed.get('link', "") + # self.feed.save() + # except (urllib2.HTTPError), e: + # self.feed.save_page_history(e.code, e.msg, e.fp.read()) + # return + # except (httplib.IncompleteRead), e: + # self.feed.save_page_history(500, "IncompleteRead", e) + # return except Exception, e: logging.debug('[%d] ! 
-------------------------' % (self.feed.id,)) tb = traceback.format_exc() diff --git a/local_settings.py.template b/local_settings.py.template index 2970894bd..674103555 100644 --- a/local_settings.py.template +++ b/local_settings.py.template @@ -1,5 +1,46 @@ import logging +# =================== +# = Server Settings = +# =================== + +ADMINS = ( + ('Samuel Clay', 'samuel@ofbrooklyn.com'), +) + +SERVER_EMAIL = 'server@newsblur.com' +HELLO_EMAIL = 'hello@newsblur.com' +NEWSBLUR_URL = 'http://www.newsblur.com' + +# =================== +# = Global Settings = +# =================== + +DEBUG = True +MEDIA_URL = '/media/' +SECRET_KEY = 'YOUR SECRET KEY' + +CACHE_BACKEND = 'dummy:///' +# CACHE_BACKEND = 'locmem:///' +# CACHE_BACKEND = 'memcached://127.0.0.1:11211' + +EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' + +# Set this to the username that is shown on the homepage to unauthenticated users. +HOMEPAGE_USERNAME = 'conesus' + +# Google Reader OAuth API Keys +OAUTH_KEY = 'www.example.com' +OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE' + +S3_ACCESS_KEY = 'XXX' +S3_SECRET = 'SECRET' +S3_BACKUP_BUCKET = 'newsblur_backups' + +# ============= +# = Databases = +# ============= + DATABASES = { 'default': { 'NAME': 'newsblur', @@ -20,28 +61,13 @@ MONGODB_SLAVE = { 'host': '127.0.0.1' } -DEBUG = True - -MEDIA_URL = '/media/' - -SECRET_KEY = 'YOUR SECRET KEY' - -CACHE_BACKEND = 'dummy:///' -# CACHE_BACKEND = 'locmem:///' -# CACHE_BACKEND = 'memcached://127.0.0.1:11211' - -EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' - -# Set this to the username that is shown on the homepage to unauthenticated users. 
-HOMEPAGE_USERNAME = 'conesus' - -# Google Reader OAuth API Keys -OAUTH_KEY = 'www.example.com' -OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE' - # Celery RabbitMQ Broker BROKER_HOST = "127.0.0.1" +# =========== +# = Logging = +# =========== + # Logging (setup for development) LOG_TO_STREAM = True @@ -51,7 +77,3 @@ if len(logging._handlerList) < 1: format='%(asctime)-12s: %(message)s', datefmt='%b %d %H:%M:%S', handler=logging.StreamHandler) - -S3_ACCESS_KEY = 'XXX' -S3_SECRET = 'SECRET' -S3_BACKUP_BUCKET = 'newsblur_backups' diff --git a/settings.py b/settings.py index e31f65e4d..8d8d77dcc 100644 --- a/settings.py +++ b/settings.py @@ -4,6 +4,18 @@ import os from mongoengine import connect import redis +# =================== +# = Server Settings = +# =================== + +ADMINS = ( + ('Samuel Clay', 'samuel@ofbrooklyn.com'), +) + +SERVER_EMAIL = 'server@newsblur.com' +HELLO_EMAIL = 'hello@newsblur.com' +NEWSBLUR_URL = 'http://www.newsblur.com' + # =========================== # = Directory Declaractions = # =========================== @@ -25,14 +37,10 @@ if '/utils' not in ' '.join(sys.path): sys.path.append(UTILS_ROOT) if '/vendor' not in ' '.join(sys.path): sys.path.append(VENDOR_ROOT) - # =================== # = Global Settings = # =================== -ADMINS = ( - ('Samuel Clay', 'samuel@ofbrooklyn.com'), -) TEST_DEBUG = False SEND_BROKEN_LINK_EMAILS = False MANAGERS = ADMINS @@ -305,8 +313,6 @@ SESSION_ENGINE = "django.contrib.sessions.backends.db" TEST_RUNNER = "utils.testrunner.TestRunner" SESSION_COOKIE_NAME = 'newsblur_sessionid' SESSION_COOKIE_AGE = 60*60*24*365*2 # 2 years -SERVER_EMAIL = 'server@newsblur.com' -HELLO_EMAIL = 'hello@newsblur.com' # =========== # = Logging = diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index 9441336f4..6956891ee 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -23,7 +23,6 @@ import redis # Refresh feed code adapted from Feedjack. 
# http://feedjack.googlecode.com -URL = 'http://www.newsblur.com/' SLOWFEED_WARNING = 10 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4) FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5) @@ -63,7 +62,7 @@ class FetchFeed: USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', - URL + settings.NEWSBLUR_URL ) self.fpf = feedparser.parse(self.feed.feed_address,