mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Adding URL normalization on new feeds. Also adding in collocation search backend. This is a good example of poor committing.
This commit is contained in:
parent
65e5b0dada
commit
db4776f3ea
4 changed files with 253 additions and 6 deletions
|
@ -5,7 +5,7 @@ from apps.rss_feeds.models import Feed
|
|||
from apps.reader.models import UserSubscription, UserSubscriptionFolders
|
||||
import datetime
|
||||
import lxml.etree
|
||||
from utils import json
|
||||
from utils import json, urlnorm
|
||||
import utils.opml as opml
|
||||
|
||||
class OAuthToken(models.Model):
|
||||
|
@ -54,10 +54,12 @@ class OPMLImporter(Importer):
|
|||
setattr(feed, 'htmlUrl', None)
|
||||
if not hasattr(feed, 'title'):
|
||||
setattr(feed, 'title', feed.htmlUrl)
|
||||
print '\t%s - %s - %s' % (feed.title, feed.htmlUrl, feed.xmlUrl,)
|
||||
feed_data = dict(feed_address=feed.xmlUrl, feed_link=feed.htmlUrl, feed_title=feed.title)
|
||||
feed_address = urlnorm.normalize(feed.xmlUrl)
|
||||
feed_link = urlnorm.normalize(feed.htmlUrl)
|
||||
print '\t%s - %s - %s' % (feed.title, feed_link, feed_address,)
|
||||
feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
|
||||
# feeds.append(feed_data)
|
||||
feed_db, _ = Feed.objects.get_or_create(feed_address=feed.xmlUrl, defaults=dict(**feed_data))
|
||||
feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address, defaults=dict(**feed_data))
|
||||
us, _ = UserSubscription.objects.get_or_create(
|
||||
feed=feed_db,
|
||||
user=self.user,
|
||||
|
@ -106,7 +108,10 @@ class GoogleReaderImporter(Importer):
|
|||
|
||||
if not feed_address:
|
||||
feed_address = feed_link
|
||||
|
||||
|
||||
feed_link = urlnorm.normalize(feed_link)
|
||||
feed_address = urlnorm.normalize(feed_address)
|
||||
|
||||
feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed_title)
|
||||
feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address, defaults=dict(**feed_data))
|
||||
us, _ = UserSubscription.objects.get_or_create(
|
||||
|
|
|
@ -22,7 +22,7 @@ try:
|
|||
from apps.rss_feeds.models import Feed, Story, FeedPage
|
||||
except:
|
||||
pass
|
||||
from utils import json
|
||||
from utils import json, urlnorm
|
||||
from utils.user_functions import get_user, ajax_login_required
|
||||
from utils.feed_functions import fetch_address_from_page, format_relative_date
|
||||
|
||||
|
@ -388,6 +388,7 @@ def add_url(request):
|
|||
feed = None
|
||||
|
||||
if url:
|
||||
url = urlnorm.normalize(url)
|
||||
feed = Feed.objects.filter(Q(feed_address=url)
|
||||
| Q(feed_link__icontains=url))
|
||||
|
||||
|
|
|
@ -4,6 +4,9 @@ import difflib
|
|||
import datetime
|
||||
import hashlib
|
||||
import random
|
||||
import nltk
|
||||
import re
|
||||
from BeautifulSoup import BeautifulStoneSoup
|
||||
from django.db import models
|
||||
from django.db import IntegrityError
|
||||
from django.core.cache import cache
|
||||
|
@ -477,9 +480,31 @@ class Feed(models.Model):
|
|||
|
||||
self.save(lock=lock)
|
||||
|
||||
def calculate_collocations(self):
    """Find the most strongly associated three-word phrases in this feed.

    Joins the content of every story belonging to the feed, removes
    markup, splits the text into words and ranks the trigrams that occur
    at least three times by pointwise mutual information (PMI).

    Returns:
        A list of at most ten phrases, each a space-joined trigram, best
        first. The list is also printed, as before.
    """
    trigram_measures = nltk.collocations.TrigramAssocMeasures()

    stories = Story.objects.filter(story_feed=self)
    # Skip empty bodies so the join cannot fail on a missing value.
    story_content = ' '.join([s.story_content for s in stories
                              if s.story_content])

    # Turn typographic apostrophes (numeric entity or literal character)
    # into plain ones *before* entities are decoded, so contractions stay
    # a single token for the word splitter below. The pattern is spelled
    # with escapes to keep this source file ASCII-only.
    story_content = re.sub(u'&#8217;|\u2019', "'", story_content)
    story_content = unicode(BeautifulStoneSoup(
        story_content,
        convertEntities=BeautifulStoneSoup.HTML_ENTITIES))

    # Strip tags whether or not they carry attributes; the previous
    # pattern required whitespace after the tag name, so "<p>" and "</p>"
    # survived and leaked "p" into the word list. Replace with a space so
    # words on either side of a tag are not glued together.
    story_content = re.sub(r'</?\w+[^>]*>', ' ', story_content)
    words = re.split(r"[^A-Za-z-']+", story_content)

    finder = nltk.collocations.TrigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(3)
    best = finder.nbest(trigram_measures.pmi, 10)
    phrases = [' '.join(phrase) for phrase in best]

    print(phrases)
    return phrases
|
||||
|
||||
class Meta:
|
||||
db_table="feeds"
|
||||
ordering=["feed_title"]
|
||||
|
||||
class FeedCollocations(models.Model):
    """A single phrase recorded against a feed.

    NOTE(review): presumably meant to persist phrases produced by
    Feed.calculate_collocations() -- confirm against the code that
    populates this table.
    """

    # Feed the phrase belongs to.
    feed = models.ForeignKey(Feed)

    # Phrase text, limited to 500 characters.
    phrase = models.CharField(max_length=500)
|
||||
|
||||
class Tag(models.Model):
|
||||
feed = models.ForeignKey(Feed)
|
||||
|
|
216
utils/urlnorm.py
Normal file
216
utils/urlnorm.py
Normal file
|
@ -0,0 +1,216 @@
|
|||
"""
|
||||
URI Normalization function:
|
||||
* Always provide the URI scheme in lowercase characters.
|
||||
* Always provide the host, if any, in lowercase characters.
|
||||
* Only perform percent-encoding where it is essential.
|
||||
* Always use uppercase A-through-F characters when percent-encoding.
|
||||
* Prevent dot-segments appearing in non-relative URI paths.
|
||||
* For schemes that define a default authority, use an empty authority if the
|
||||
default is desired.
|
||||
* For schemes that define an empty path to be equivalent to a path of "/",
|
||||
use "/".
|
||||
* For schemes that define a port, use an empty port if the default is desired
|
||||
* All portions of the URI must be utf-8 encoded NFC from Unicode strings
|
||||
|
||||
implements:
|
||||
http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
|
||||
http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
|
||||
|
||||
inspired by:
|
||||
Tony J. Ibbs, http://starship.python.net/crew/tibs/python/tji_url.py
|
||||
Mark Nottingham, http://www.mnot.net/python/urlnorm.py
|
||||
"""
|
||||
|
||||
__license__ = "Python"
|
||||
|
||||
import re, unicodedata, urlparse
|
||||
from urllib import quote, unquote
|
||||
|
||||
# Port that normalize() treats as implicit for each URI scheme: an explicit
# port equal to the value listed here is dropped from the normalized URL.
default_port = dict(
    ftp=21,
    telnet=23,
    gopher=70,
    http=80,
    news=119,
    nntp=119,
    prospero=191,
    https=443,
    snews=563,
    snntp=563,
)
|
||||
|
||||
def normalize(url):
    """Return a canonical form of *url*.

    The URL is split into its components, each component is brought into
    a canonical spelling (lowercase scheme and host, minimal and
    uppercase percent-encoding, UTF-8/NFC text, no dot-segments, no
    default port, no empty userinfo) and the pieces are joined again.
    Finally a result without "://" is prefixed with "http://", and a
    leading "feed://" scheme is rewritten to "http://".

    Args:
        url: The URL as a byte string or unicode string, or None.

    Returns:
        The normalized URL as a UTF-8 byte string. None is returned
        unchanged and a blank string yields an empty string, so callers
        can pass through optional links without pre-checking them.
    """
    # Nothing to normalize; previously this raised AttributeError on None
    # and turned a blank string into the meaningless "http:///".
    if url is None:
        return None
    url = url.strip()
    if not url:
        return url

    # clean() below decodes with unicode(..., 'utf-8'), which raises
    # TypeError when handed a unicode object, so work on bytes throughout.
    if isinstance(url, unicode):
        url = url.encode('utf-8')

    scheme, auth, path, query, fragment = urlparse.urlsplit(url)
    userinfo, host, port = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()

    # Always provide the URI scheme in lowercase characters.
    scheme = scheme.lower()

    # Always provide the host, if any, in lowercase characters, and drop
    # a trailing root-label dot.
    host = host.lower()
    if host.endswith('.'):
        host = host[:-1]

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
    def clean(string):
        string = unicode(unquote(string), 'utf-8', 'replace')
        return unicodedata.normalize('NFC', string).encode('utf-8')

    path = quote(clean(path), "~:/?#[]@!$&'()*+,;=")
    fragment = quote(clean(fragment), "~")

    # note care must be taken to only encode & and = characters as values
    query = "&".join(
        "=".join(quote(clean(part), "~:/?#[]@!$'()*+,;=")
                 for part in pair.split("=", 1))
        for pair in query.split("&"))

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ("", "http", "https", "ftp", "file"):
        segments = path.split('/')
        output = []
        for segment in segments:
            if segment == "":
                # Keep only the leading empty segment (the root slash);
                # later empty segments are collapsed.
                if not output:
                    output.append(segment)
            elif segment == ".":
                pass
            elif segment == "..":
                # Never pop the root segment itself.
                if len(output) > 1:
                    output.pop()
            else:
                output.append(segment)
        # A path ending in "/", "." or ".." names a directory, so keep a
        # trailing slash on the result.
        if segments[-1] in ("", ".", ".."):
            output.append("")
        path = '/'.join(output)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ("@", ":@"):
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ("http", "https", "ftp", "file"):
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired. Non-numeric ports are left untouched.
    if port and scheme in default_port:
        if port.isdigit():
            port = str(int(port))
            if int(port) == default_port[scheme]:
                port = ''

    # Put it all back together again
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    # urlsplit drops an empty fragment marker; restore a bare trailing "#".
    if url.endswith("#") and query == "" and fragment == "":
        path += "#"
    url = urlparse.urlunsplit((scheme, auth, path, query, fragment))

    # Treat scheme-less input (e.g. "example.com/feed") as HTTP.
    if '://' not in url:
        url = 'http://' + url
    # Rewrite only the leading scheme, not every "feed://" occurrence that
    # might appear later in the path or query.
    if url.startswith('feed://'):
        url = 'http://' + url[len('feed://'):]

    return url
|
||||
|
||||
if __name__ == "__main__":
    # Self-test: builds one TestCase per table entry and runs them with the
    # text runner. Uses unittest assertion methods rather than bare
    # `assert`, which is compiled away under `python -O`.
    #
    # NOTE(review): normalize() in this file prepends "http://" to any
    # result lacking "://", so the scheme-less path cases and the
    # non-hierarchical URIs below (mailto:, news:, tel:, urn:) appear to be
    # expected failures of that tweak -- confirm which side is intended.
    import unittest

    suite = unittest.TestSuite()

    # From http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
    # Each entry is (already_normalized, url): True means normalize() must
    # return the URL unchanged, False means it must change it.
    idempotence_cases = [
        (False, "http://:@example.com/"),
        (False, "http://@example.com/"),
        (False, "http://example.com"),
        (False, "HTTP://example.com/"),
        (False, "http://EXAMPLE.COM/"),
        (False, "http://example.com/%7Ejane"),
        (False, "http://example.com/?q=%C7"),
        (False, "http://example.com/?q=%5c"),
        (False, "http://example.com/?q=C%CC%A7"),
        (False, "http://example.com/a/../a/b"),
        (False, "http://example.com/a/./b"),
        (False, "http://example.com:80/"),
        (True, "http://example.com/"),
        (True, "http://example.com/?q=%C3%87"),
        (True, "http://example.com/?q=%E2%85%A0"),
        (True, "http://example.com/?q=%5C"),
        (True, "http://example.com/~jane"),
        (True, "http://example.com/a/b"),
        (True, "http://example.com:8080/"),
        (True, "http://user:password@example.com/"),

        # from rfc2396bis
        (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
        (True, "http://www.ietf.org/rfc/rfc2396.txt"),
        (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
        (True, "mailto:John.Doe@example.com"),
        (True, "news:comp.infosystems.www.servers.unix"),
        (True, "tel:+1-816-555-1212"),
        (True, "telnet://192.0.2.16:80/"),
        (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),

        # other
        (True, "http://127.0.0.1/"),
        (False, "http://127.0.0.1:80/"),
        (True, "http://www.w3.org/2000/01/rdf-schema#"),
        (False, "http://example.com:081/"),
    ]

    def idempotence_testcase(expected, value):
        """Build a test checking whether normalize() leaves *value* as-is."""
        class test(unittest.TestCase):
            def runTest(self):
                result = normalize(value)
                self.assertEqual(result == value, expected,
                                 repr((expected, value, result)))
        return test()

    for (expected, value) in idempotence_cases:
        suite.addTest(idempotence_testcase(expected, value))

    # mnot test suite; three tests updated for rfc2396bis.
    # Maps an input URL to the exact output normalize() must produce.
    rewrite_cases = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/baz',  # was: '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/foo',  # was: '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/foo',  # was: '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        'ftp://user:pass@ftp.foo.net/foo/bar':
            'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar':
            'http://USER:pass@www.example.com/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }

    def rewrite_testcase(original, normalized):
        """Build a test checking normalize(original) == normalized."""
        class test(unittest.TestCase):
            def runTest(self):
                result = normalize(original)
                self.assertEqual(result, normalized,
                                 repr((original, normalized, result)))
        return test()

    for (original, normalized) in rewrite_cases.items():
        suite.addTest(rewrite_testcase(original, normalized))

    # execute tests
    unittest.TextTestRunner().run(suite)
|
Loading…
Add table
Reference in a new issue