Mirror of https://github.com/viq/NewsBlur.git (synced 2025-08-31 22:20:12 +00:00)

commit 617dc3bb56, parent 3cf4a8a894
Switching to requests from urllib2/httplib. Bring on the page errors.
5 changed files with 89 additions and 52 deletions
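
At its core the change replaces urllib2's two-step request/response dance with a single requests call. A minimal before/after sketch (simplified from the page importer diff below; `url` and `HEADERS` stand in for the real arguments):

    # Before: urllib2 (Python 2) builds a Request object, then opens it.
    import urllib2
    request = urllib2.Request(url, headers=HEADERS)
    response = urllib2.urlopen(request)
    data = response.read()        # raw bytes

    # After: requests does it in one call.
    import requests
    response = requests.get(url, headers=HEADERS)
    data = response.content      # raw bytes; response.text would decode to unicode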

@@ -5,6 +5,7 @@ from apps.rss_feeds.models import Feed
 from optparse import make_option
 from utils import feed_fetcher
 from utils.management_functions import daemonize
+import django
 import socket
 import datetime
 import redis

@@ -75,12 +76,14 @@ class Command(BaseCommand):
         feeds_queue = []
         for _ in range(num_workers):
             feeds_queue.append([])
 
         i = 0
         for feed in feeds:
             feeds_queue[i%num_workers].append(feed.pk)
             i += 1
         disp.add_jobs(feeds_queue, i)
 
+        django.db.connection.close()
+
         print " ---> Fetching %s feeds..." % feeds.count()
         disp.run_jobs()
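
The one functional addition above is `django.db.connection.close()` before the jobs run. The dispatcher hands work to separate worker processes, and a database socket inherited across a fork is shared by every child, which corrupts the connection protocol; closing it first forces each worker to open its own. A hedged sketch of the pattern, assuming a fork-based worker loop (`run_job` is a hypothetical worker entry point):

    import os
    import django.db

    def spawn_workers(jobs):
        # Drop the parent's DB connection so forked children don't share it;
        # each child reconnects lazily on its first query.
        django.db.connection.close()
        for job in jobs:
            if os.fork() == 0:    # child process
                run_job(job)      # hypothetical worker entry point
                os._exit(0)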

@@ -1,18 +1,14 @@
-import urllib2, httplib
+import requests
 import re
 import urlparse
 import traceback
 import feedparser
 import time
+from django.conf import settings
 from utils import log as logging
 from apps.rss_feeds.models import MFeedPage
 from utils.feed_functions import timelimit, mail_feed_error_to_admin
 
-HEADERS = {
-    'User-Agent': 'NewsBlur Page Fetcher - http://www.newsblur.com',
-    'Connection': 'close',
-}
-
 BROKEN_PAGES = [
     'tag:',
     'info:',

@@ -26,6 +22,18 @@ class PageImporter(object):
     def __init__(self, url, feed):
         self.url = url
         self.feed = feed
+        self.setup_headers()
 
+    def setup_headers(self):
+        s = requests.session()
+        s.config['keep_alive'] = False
+        self.headers = {
+            'User-Agent': 'NewsBlur Page Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % (
+                self.feed.num_subscribers,
+                's' if self.feed.num_subscribers != 1 else '',
+                settings.NEWSBLUR_URL
+            ),
+        }
+
     @timelimit(15)
     def fetch_page(self):
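
Two things worth noting in `setup_headers`. First, `requests.session()` with a `config` dict is the pre-1.0 requests API this commit was written against. Second, the session is created but never used: `fetch_page` below calls module-level `requests.get`, so the keep-alive toggle likely has no effect on the actual fetch. Under the modern (1.x+) API the same intent would look roughly like this sketch, with `url` assumed:

    import requests

    session = requests.Session()
    # requests >= 1.0 dropped session.config; sending Connection: close
    # per request is the plain way to opt out of keep-alive.
    session.headers['Connection'] = 'close'
    response = session.get(url)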

@@ -35,10 +43,9 @@ class PageImporter(object):
 
         try:
             if self.url.startswith('http'):
-                request = urllib2.Request(self.url, headers=HEADERS)
-                response = urllib2.urlopen(request)
+                response = requests.get(self.url, headers=self.headers)
                 time.sleep(0.01) # Grrr, GIL.
-                data = response.read()
+                data = response.content
             elif any(self.url.startswith(s) for s in BROKEN_PAGES):
                 self.save_no_page()
                 return

@@ -46,17 +53,17 @@
                 data = open(self.url, 'r').read()
             html = self.rewrite_page(data)
             self.save_page(html)
-        except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
-            self.feed.save_page_history(401, "Bad URL", e)
-            fp = feedparser.parse(self.feed.feed_address)
-            self.feed.feed_link = fp.feed.get('link', "")
-            self.feed.save()
-        except (urllib2.HTTPError), e:
-            self.feed.save_page_history(e.code, e.msg, e.fp.read())
-            return
-        except (httplib.IncompleteRead), e:
-            self.feed.save_page_history(500, "IncompleteRead", e)
-            return
+        # except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
+        #     self.feed.save_page_history(401, "Bad URL", e)
+        #     fp = feedparser.parse(self.feed.feed_address)
+        #     self.feed.feed_link = fp.feed.get('link', "")
+        #     self.feed.save()
+        # except (urllib2.HTTPError), e:
+        #     self.feed.save_page_history(e.code, e.msg, e.fp.read())
+        #     return
+        # except (httplib.IncompleteRead), e:
+        #     self.feed.save_page_history(500, "IncompleteRead", e)
+        #     return
         except Exception, e:
             logging.debug('[%d] ! -------------------------' % (self.feed.id,))
             tb = traceback.format_exc()
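
Commenting the urllib2/httplib handlers out rather than translating them is exactly what the commit message means by "bring on the page errors": every failure now falls through to the generic `except Exception` logger until requests-specific handling is written. A possible translation, assuming the modern requests exception hierarchy and the same `save_page_history` semantics:

    import requests

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()    # raise HTTPError on 4xx/5xx statuses
    except requests.exceptions.HTTPError as e:
        feed.save_page_history(e.response.status_code, e.response.reason,
                               e.response.content)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.Timeout, ValueError) as e:
        # Malformed URLs raise MissingSchema, a ValueError subclass.
        feed.save_page_history(401, "Bad URL", e)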

@@ -1,5 +1,46 @@
 import logging
 
+# ===================
+# = Server Settings =
+# ===================
+
+ADMINS = (
+    ('Samuel Clay', 'samuel@ofbrooklyn.com'),
+)
+
+SERVER_EMAIL = 'server@newsblur.com'
+HELLO_EMAIL = 'hello@newsblur.com'
+NEWSBLUR_URL = 'http://www.newsblur.com'
+
+# ==================
+# = Global Settngs =
+# ==================
+
+DEBUG = True
+MEDIA_URL = '/media/'
+SECRET_KEY = 'YOUR SECRET KEY'
+
+CACHE_BACKEND = 'dummy:///'
+# CACHE_BACKEND = 'locmem:///'
+# CACHE_BACKEND = 'memcached://127.0.0.1:11211'
+
+EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
+
+# Set this to the username that is shown on the homepage to unauthenticated users.
+HOMEPAGE_USERNAME = 'conesus'
+
+# Google Reader OAuth API Keys
+OAUTH_KEY = 'www.example.com'
+OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE'
+
+S3_ACCESS_KEY = 'XXX'
+S3_SECRET = 'SECRET'
+S3_BACKUP_BUCKET = 'newsblur_backups'
+
+# =============
+# = Databases =
+# =============
+
 DATABASES = {
     'default': {
         'NAME': 'newsblur',

@@ -20,28 +61,13 @@ MONGODB_SLAVE = {
     'host': '127.0.0.1'
 }
 
-DEBUG = True
-
-MEDIA_URL = '/media/'
-
-SECRET_KEY = 'YOUR SECRET KEY'
-
-CACHE_BACKEND = 'dummy:///'
-# CACHE_BACKEND = 'locmem:///'
-# CACHE_BACKEND = 'memcached://127.0.0.1:11211'
-
-EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
-
-# Set this to the username that is shown on the homepage to unauthenticated users.
-HOMEPAGE_USERNAME = 'conesus'
-
-# Google Reader OAuth API Keys
-OAUTH_KEY = 'www.example.com'
-OAUTH_SECRET = 'SECRET_KEY_FROM_GOOGLE'
-
 
 # Celery RabbitMQ Broker
 BROKER_HOST = "127.0.0.1"
 
+# ===========
+# = Logging =
+# ===========
+
 # Logging (setup for development)
 LOG_TO_STREAM = True

@@ -51,7 +77,3 @@ if len(logging._handlerList) < 1:
        format='%(asctime)-12s: %(message)s',
        datefmt='%b %d %H:%M:%S',
        handler=logging.StreamHandler)
-
-S3_ACCESS_KEY = 'XXX'
-S3_SECRET = 'SECRET'
-S3_BACKUP_BUCKET = 'newsblur_backups'

settings.py (18 lines changed)

@@ -4,6 +4,18 @@ import os
 from mongoengine import connect
 import redis
 
+# ===================
+# = Server Settings =
+# ===================
+
+ADMINS = (
+    ('Samuel Clay', 'samuel@ofbrooklyn.com'),
+)
+
+SERVER_EMAIL = 'server@newsblur.com'
+HELLO_EMAIL = 'hello@newsblur.com'
+NEWSBLUR_URL = 'http://www.newsblur.com'
+
 # ===========================
 # = Directory Declaractions =
 # ===========================

@@ -25,14 +37,10 @@ if '/utils' not in ' '.join(sys.path):
     sys.path.append(UTILS_ROOT)
 if '/vendor' not in ' '.join(sys.path):
     sys.path.append(VENDOR_ROOT)
 
 # ===================
 # = Global Settings =
 # ===================
-
-ADMINS = (
-    ('Samuel Clay', 'samuel@ofbrooklyn.com'),
-)
 TEST_DEBUG = False
 SEND_BROKEN_LINK_EMAILS = False
 MANAGERS = ADMINS

@@ -305,8 +313,6 @@ SESSION_ENGINE = "django.contrib.sessions.backends.db"
 TEST_RUNNER = "utils.testrunner.TestRunner"
 SESSION_COOKIE_NAME = 'newsblur_sessionid'
 SESSION_COOKIE_AGE = 60*60*24*365*2 # 2 years
-SERVER_EMAIL = 'server@newsblur.com'
-HELLO_EMAIL = 'hello@newsblur.com'
 
 # ===========
 # = Logging =

@@ -23,7 +23,6 @@ import redis
 # Refresh feed code adapted from Feedjack.
 # http://feedjack.googlecode.com
 
-URL = 'http://www.newsblur.com/'
 SLOWFEED_WARNING = 10
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
 FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)

@@ -63,7 +62,7 @@ class FetchFeed:
         USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % (
             self.feed.num_subscribers,
             's' if self.feed.num_subscribers != 1 else '',
-            URL
+            settings.NEWSBLUR_URL
         )
 
         self.fpf = feedparser.parse(self.feed.feed_address,
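
The `feedparser.parse` call is truncated at the hunk boundary, so its remaining arguments aren't shown here. If it follows feedparser's usual signature, the `agent` keyword is where `USER_AGENT` ends up; a sketch of that assumption:

    import feedparser

    # feedparser fetches feed_address itself and sends the given User-Agent.
    fpf = feedparser.parse(feed.feed_address, agent=USER_AGENT)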