Switching to requests from urllib2/httplib. Bring on the page errors.

Samuel Clay 2011-11-26 02:07:31 -05:00
parent 3cf4a8a894
commit 617dc3bb56
5 changed files with 89 additions and 52 deletions

View file

@@ -5,6 +5,7 @@ from apps.rss_feeds.models import Feed
 from optparse import make_option
 from utils import feed_fetcher
 from utils.management_functions import daemonize
+import django
 import socket
 import datetime
 import redis
@@ -75,12 +76,14 @@ class Command(BaseCommand):
         feeds_queue = []
         for _ in range(num_workers):
             feeds_queue.append([])

         i = 0
         for feed in feeds:
             feeds_queue[i%num_workers].append(feed.pk)
             i += 1

         disp.add_jobs(feeds_queue, i)
+        django.db.connection.close()
+
         print " ---> Fetching %s feeds..." % feeds.count()
         disp.run_jobs()
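A note on the hunk above: `django.db.connection.close()` runs in the parent just before the workers start fetching. Forked children would otherwise inherit the parent's open database socket and corrupt it with interleaved reads; closing it forces each process to lazily open its own connection on first query. A minimal sketch of the pattern, assuming a configured Django project (the worker function and feed-id chunks here are hypothetical, not NewsBlur's actual dispatcher):

    import os
    import django.db

    def run_worker(feed_ids):
        # Hypothetical worker body; the first ORM query after the
        # fork opens a fresh, per-process database connection.
        pass

    django.db.connection.close()   # parent drops its handle before forking
    for chunk in ([1, 2, 3], [4, 5, 6]):
        if os.fork() == 0:         # POSIX-only, as in the fetcher daemon
            run_worker(chunk)
            os._exit(0)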

View file

@@ -1,18 +1,14 @@
-import urllib2, httplib
+import requests
 import re
 import urlparse
 import traceback
 import feedparser
 import time
+from django.conf import settings
 from utils import log as logging
 from apps.rss_feeds.models import MFeedPage
 from utils.feed_functions import timelimit, mail_feed_error_to_admin

-HEADERS = {
-    'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
-    'Connection': 'close',
-}
-
 BROKEN_PAGES = [
     'tag:',
     'info:',
@@ -26,6 +22,18 @@ class PageImporter(object):
     def __init__(self, url, feed):
         self.url = url
         self.feed = feed
+        self.setup_headers()
+
+    def setup_headers(self):
+        s = requests.session()
+        s.config['keep_alive'] = False
+        self.headers = {
+            'User-Agent': 'NewsBlur Page Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % (
+                self.feed.num_subscribers,
+                's' if self.feed.num_subscribers != 1 else '',
+                settings.NEWSBLUR_URL
+            ),
+        }

     @timelimit(15)
     def fetch_page(self):
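One quirk in `setup_headers` above: the session `s` has `keep_alive` disabled, but `fetch_page` (next hunk) calls `requests.get()` directly, so the setting only applies to that otherwise-unused session object. A sketch of routing the request through the configured session instead, in the same 0.x-era API this commit targets (the URL and User-Agent string are illustrative):

    import requests

    s = requests.session()
    s.config['keep_alive'] = False    # 0.x config dict, as in the diff
    response = s.get('http://www.example.com/',
                     headers={'User-Agent': 'NewsBlur Page Fetcher - test'})
    data = response.content           # raw bytes, as in the fetch_page change

Note this `s.config` dict is specific to early requests; later versions replaced it with `requests.Session()` objects that carry per-session headers and adapters.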
@@ -35,10 +43,9 @@ class PageImporter(object):
         try:
             if self.url.startswith('http'):
-                request = urllib2.Request(self.url, headers=HEADERS)
-                response = urllib2.urlopen(request)
+                response = requests.get(self.url, headers=self.headers)
                 time.sleep(0.01) # Grrr, GIL.
-                data = response.read()
+                data = response.content
             elif any(self.url.startswith(s) for s in BROKEN_PAGES):
                 self.save_no_page()
                 return
@@ -46,17 +53,17 @@
                 data = open(self.url, 'r').read()
             html = self.rewrite_page(data)
             self.save_page(html)
-        except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
-            self.feed.save_page_history(401, "Bad URL", e)
-            fp = feedparser.parse(self.feed.feed_address)
-            self.feed.feed_link = fp.feed.get('link', "")
-            self.feed.save()
-        except (urllib2.HTTPError), e:
-            self.feed.save_page_history(e.code, e.msg, e.fp.read())
-            return
-        except (httplib.IncompleteRead), e:
-            self.feed.save_page_history(500, "IncompleteRead", e)
-            return
+        # except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
+        #     self.feed.save_page_history(401, "Bad URL", e)
+        #     fp = feedparser.parse(self.feed.feed_address)
+        #     self.feed.feed_link = fp.feed.get('link', "")
+        #     self.feed.save()
+        # except (urllib2.HTTPError), e:
+        #     self.feed.save_page_history(e.code, e.msg, e.fp.read())
+        #     return
+        # except (httplib.IncompleteRead), e:
+        #     self.feed.save_page_history(500, "IncompleteRead", e)
+        #     return
         except Exception, e:
             logging.debug('[%d] ! -------------------------' % (self.feed.id,))
             tb = traceback.format_exc()
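With the urllib2/httplib handlers commented out, only the catch-all `except Exception` remains, which is what the commit message's "Bring on the page errors" is owning up to. A hedged sketch of what requests-native handlers might eventually look like, using exception classes from `requests.exceptions`; the status codes passed to `save_page_history` are illustrative guesses, not what NewsBlur later shipped:

    import requests

    def fetch(url, headers, save_page_history):
        # Hypothetical replacement for the commented-out blocks above.
        try:
            response = requests.get(url, headers=headers)
            return response.content
        except requests.exceptions.ConnectionError, e:
            save_page_history(503, 'Connection Error', e)
        except requests.exceptions.Timeout, e:
            save_page_history(504, 'Timeout', e)
        except requests.exceptions.RequestException, e:
            save_page_history(500, 'Request Error', e)  # generic base class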

View file

@@ -1,5 +1,46 @@
 import logging

+# ===================
+# = Server Settings =
+# ===================
+
+ADMINS = (
+    ('Samuel Clay', 'samuel@ofbrooklyn.com'),
+)
+
+SERVER_EMAIL = 'server@newsblur.com'
+HELLO_EMAIL = 'hello@newsblur.com'
+NEWSBLUR_URL = 'http://www.newsblur.com'
+
+# ===================
+# = Global Settings =
+# ===================
+
+DEBUG = True
+MEDIA_URL = '/media/'
+SECRET_KEY = 'YOUR SECRET KEY'
+
+CACHE_BACKEND = 'dummy:///'
+# CACHE_BACKEND = 'locmem:///'
+# CACHE_BACKEND = 'memcached://127.0.0.1:11211'
+
+EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
+
+# Set this to the username that is shown on the homepage to unauthenticated users.
+HOMEPAGE_USERNAME = 'conesus'
+
+# Google Reader OAuth API Keys
+OAUTH_KEY = 'www.example.com'
+OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE'
+
+S3_ACCESS_KEY = 'XXX'
+S3_SECRET = 'SECRET'
+S3_BACKUP_BUCKET = 'newsblur_backups'
+
+# =============
+# = Databases =
+# =============
+
 DATABASES = {
     'default': {
         'NAME': 'newsblur',
@@ -20,28 +61,13 @@ MONGODB_SLAVE = {
     'host': '127.0.0.1'
 }

-DEBUG = True
-MEDIA_URL = '/media/'
-SECRET_KEY = 'YOUR SECRET KEY'

-CACHE_BACKEND = 'dummy:///'
-# CACHE_BACKEND = 'locmem:///'
-# CACHE_BACKEND = 'memcached://127.0.0.1:11211'

-EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'

-# Set this to the username that is shown on the homepage to unauthenticated users.
-HOMEPAGE_USERNAME = 'conesus'

-# Google Reader OAuth API Keys
-OAUTH_KEY = 'www.example.com'
-OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE'

 # Celery RabbitMQ Broker
 BROKER_HOST = "127.0.0.1"

-# ===========
-# = Logging =
-# ===========

 # Logging (setup for development)
 LOG_TO_STREAM = True
@@ -51,7 +77,3 @@ if len(logging._handlerList) < 1:
     format='%(asctime)-12s: %(message)s',
     datefmt='%b %d %H:%M:%S',
     handler=logging.StreamHandler)
-
-S3_ACCESS_KEY = 'XXX'
-S3_SECRET = 'SECRET'
-S3_BACKUP_BUCKET = 'newsblur_backups'
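The hunks in this file and the settings.py hunks that follow move the server identity (ADMINS, SERVER_EMAIL, HELLO_EMAIL, NEWSBLUR_URL) into both the real settings module and the local template, so every install can override it. A minimal sketch of the defaults-plus-override pattern this implies, assuming settings.py pulls in the local module last, as Django projects of this vintage commonly did:

    # settings.py -- project-wide defaults
    NEWSBLUR_URL = 'http://www.newsblur.com'

    # the developer's local copy, when present, wins
    try:
        from local_settings import *
    except ImportError:
        pass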

View file

@@ -4,6 +4,18 @@ import os
 from mongoengine import connect
 import redis

+# ===================
+# = Server Settings =
+# ===================
+
+ADMINS = (
+    ('Samuel Clay', 'samuel@ofbrooklyn.com'),
+)
+
+SERVER_EMAIL = 'server@newsblur.com'
+HELLO_EMAIL = 'hello@newsblur.com'
+NEWSBLUR_URL = 'http://www.newsblur.com'
+
 # ==========================
 # = Directory Declarations =
 # ==========================
@@ -25,14 +37,10 @@ if '/utils' not in ' '.join(sys.path):
     sys.path.append(UTILS_ROOT)
 if '/vendor' not in ' '.join(sys.path):
     sys.path.append(VENDOR_ROOT)

 # ===================
 # = Global Settings =
 # ===================

-ADMINS = (
-    ('Samuel Clay', 'samuel@ofbrooklyn.com'),
-)
-
 TEST_DEBUG = False
 SEND_BROKEN_LINK_EMAILS = False
 MANAGERS = ADMINS
@ -305,8 +313,6 @@ SESSION_ENGINE = "django.contrib.sessions.backends.db"
TEST_RUNNER = "utils.testrunner.TestRunner" TEST_RUNNER = "utils.testrunner.TestRunner"
SESSION_COOKIE_NAME = 'newsblur_sessionid' SESSION_COOKIE_NAME = 'newsblur_sessionid'
SESSION_COOKIE_AGE = 60*60*24*365*2 # 2 years SESSION_COOKIE_AGE = 60*60*24*365*2 # 2 years
SERVER_EMAIL = 'server@newsblur.com'
HELLO_EMAIL = 'hello@newsblur.com'
# =========== # ===========
# = Logging = # = Logging =

View file

@ -23,7 +23,6 @@ import redis
# Refresh feed code adapted from Feedjack. # Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com # http://feedjack.googlecode.com
URL = 'http://www.newsblur.com/'
SLOWFEED_WARNING = 10 SLOWFEED_WARNING = 10
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4) ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5) FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
@@ -63,7 +62,7 @@ class FetchFeed:
         USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % (
             self.feed.num_subscribers,
             's' if self.feed.num_subscribers != 1 else '',
-            URL
+            settings.NEWSBLUR_URL
         )

         self.fpf = feedparser.parse(self.feed.feed_address,
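With the module-level URL constant gone, both the feed fetcher and the page fetcher now brand their User-Agent from the single NEWSBLUR_URL setting. A reduced sketch of the string these hunks build (the helper name is mine, for illustration; assumes a configured Django settings module):

    from django.conf import settings

    def user_agent(num_subscribers):
        # Mirrors the USER_AGENT the fetchers construct, minus the
        # browser-version tail, to show the settings.NEWSBLUR_URL hookup.
        return 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
            num_subscribers,
            's' if num_subscribers != 1 else '',
            settings.NEWSBLUR_URL,
        )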