Revert "Switching to requests from urllib2/httplib. Bring on the page errors."

This reverts commit 617dc3bb56.
This commit is contained in:
Samuel Clay 2011-11-26 02:16:14 -05:00
parent 93728f8050
commit 7972d0bb5c
5 changed files with 52 additions and 89 deletions

View file

@ -5,7 +5,6 @@ from apps.rss_feeds.models import Feed
from optparse import make_option
from utils import feed_fetcher
from utils.management_functions import daemonize
import django
import socket
import datetime
import redis
@ -76,14 +75,12 @@ class Command(BaseCommand):
feeds_queue = []
for _ in range(num_workers):
feeds_queue.append([])
i = 0
for feed in feeds:
feeds_queue[i%num_workers].append(feed.pk)
i += 1
disp.add_jobs(feeds_queue, i)
django.db.connection.close()
print " ---> Fetching %s feeds..." % feeds.count()
disp.run_jobs()

View file

@ -1,14 +1,18 @@
import requests
import urllib2, httplib
import re
import urlparse
import traceback
import feedparser
import time
from django.conf import settings
from utils import log as logging
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit, mail_feed_error_to_admin
HEADERS = {
'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
'Connection': 'close',
}
BROKEN_PAGES = [
'tag:',
'info:',
@ -22,18 +26,6 @@ class PageImporter(object):
def __init__(self, url, feed):
self.url = url
self.feed = feed
self.setup_headers()
def setup_headers(self):
s = requests.session()
s.config['keep_alive'] = False
self.headers = {
'User-Agent': 'NewsBlur Page Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % (
self.feed.num_subscribers,
's' if self.feed.num_subscribers != 1 else '',
settings.NEWSBLUR_URL
),
}
@timelimit(15)
def fetch_page(self):
@ -43,9 +35,10 @@ class PageImporter(object):
try:
if self.url.startswith('http'):
response = requests.get(self.url, headers=self.headers)
request = urllib2.Request(self.url, headers=HEADERS)
response = urllib2.urlopen(request)
time.sleep(0.01) # Grrr, GIL.
data = response.content
data = response.read()
elif any(self.url.startswith(s) for s in BROKEN_PAGES):
self.save_no_page()
return
@ -53,17 +46,17 @@ class PageImporter(object):
data = open(self.url, 'r').read()
html = self.rewrite_page(data)
self.save_page(html)
# except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
# self.feed.save_page_history(401, "Bad URL", e)
# fp = feedparser.parse(self.feed.feed_address)
# self.feed.feed_link = fp.feed.get('link', "")
# self.feed.save()
# except (urllib2.HTTPError), e:
# self.feed.save_page_history(e.code, e.msg, e.fp.read())
# return
# except (httplib.IncompleteRead), e:
# self.feed.save_page_history(500, "IncompleteRead", e)
# return
except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
self.feed.save_page_history(401, "Bad URL", e)
fp = feedparser.parse(self.feed.feed_address)
self.feed.feed_link = fp.feed.get('link', "")
self.feed.save()
except (urllib2.HTTPError), e:
self.feed.save_page_history(e.code, e.msg, e.fp.read())
return
except (httplib.IncompleteRead), e:
self.feed.save_page_history(500, "IncompleteRead", e)
return
except Exception, e:
logging.debug('[%d] ! -------------------------' % (self.feed.id,))
tb = traceback.format_exc()

View file

@ -1,46 +1,5 @@
import logging
# ===================
# = Server Settings =
# ===================
ADMINS = (
('Samuel Clay', 'samuel@ofbrooklyn.com'),
)
SERVER_EMAIL = 'server@newsblur.com'
HELLO_EMAIL = 'hello@newsblur.com'
NEWSBLUR_URL = 'http://www.newsblur.com'
# ==================
# = Global Settings =
# ==================
DEBUG = True
MEDIA_URL = '/media/'
SECRET_KEY = 'YOUR SECRET KEY'
CACHE_BACKEND = 'dummy:///'
# CACHE_BACKEND = 'locmem:///'
# CACHE_BACKEND = 'memcached://127.0.0.1:11211'
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
# Set this to the username that is shown on the homepage to unauthenticated users.
HOMEPAGE_USERNAME = 'conesus'
# Google Reader OAuth API Keys
OAUTH_KEY = 'www.example.com'
OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE'
S3_ACCESS_KEY = 'XXX'
S3_SECRET = 'SECRET'
S3_BACKUP_BUCKET = 'newsblur_backups'
# =============
# = Databases =
# =============
DATABASES = {
'default': {
'NAME': 'newsblur',
@ -61,13 +20,28 @@ MONGODB_SLAVE = {
'host': '127.0.0.1'
}
DEBUG = True
MEDIA_URL = '/media/'
SECRET_KEY = 'YOUR SECRET KEY'
CACHE_BACKEND = 'dummy:///'
# CACHE_BACKEND = 'locmem:///'
# CACHE_BACKEND = 'memcached://127.0.0.1:11211'
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
# Set this to the username that is shown on the homepage to unauthenticated users.
HOMEPAGE_USERNAME = 'conesus'
# Google Reader OAuth API Keys
OAUTH_KEY = 'www.example.com'
OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE'
# Celery RabbitMQ Broker
BROKER_HOST = "127.0.0.1"
# ===========
# = Logging =
# ===========
# Logging (setup for development)
LOG_TO_STREAM = True
@ -77,3 +51,7 @@ if len(logging._handlerList) < 1:
format='%(asctime)-12s: %(message)s',
datefmt='%b %d %H:%M:%S',
handler=logging.StreamHandler)
S3_ACCESS_KEY = 'XXX'
S3_SECRET = 'SECRET'
S3_BACKUP_BUCKET = 'newsblur_backups'

View file

@ -4,18 +4,6 @@ import os
from mongoengine import connect
import redis
# ===================
# = Server Settings =
# ===================
ADMINS = (
('Samuel Clay', 'samuel@ofbrooklyn.com'),
)
SERVER_EMAIL = 'server@newsblur.com'
HELLO_EMAIL = 'hello@newsblur.com'
NEWSBLUR_URL = 'http://www.newsblur.com'
# ===========================
# = Directory Declarations =
# ===========================
@ -37,10 +25,14 @@ if '/utils' not in ' '.join(sys.path):
sys.path.append(UTILS_ROOT)
if '/vendor' not in ' '.join(sys.path):
sys.path.append(VENDOR_ROOT)
# ===================
# = Global Settings =
# ===================
ADMINS = (
('Samuel Clay', 'samuel@ofbrooklyn.com'),
)
TEST_DEBUG = False
SEND_BROKEN_LINK_EMAILS = False
MANAGERS = ADMINS
@ -313,6 +305,8 @@ SESSION_ENGINE = "django.contrib.sessions.backends.db"
TEST_RUNNER = "utils.testrunner.TestRunner"
SESSION_COOKIE_NAME = 'newsblur_sessionid'
SESSION_COOKIE_AGE = 60*60*24*365*2 # 2 years
SERVER_EMAIL = 'server@newsblur.com'
HELLO_EMAIL = 'hello@newsblur.com'
# ===========
# = Logging =

View file

@ -23,6 +23,7 @@ import redis
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
URL = 'http://www.newsblur.com/'
SLOWFEED_WARNING = 10
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
@ -62,7 +63,7 @@ class FetchFeed:
USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % (
self.feed.num_subscribers,
's' if self.feed.num_subscribers != 1 else '',
settings.NEWSBLUR_URL
URL
)
self.fpf = feedparser.parse(self.feed.feed_address,