mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Unicode decode error in URL normalization.
This commit is contained in:
parent
db4776f3ea
commit
5e42e3db9c
2 changed files with 7 additions and 7 deletions
|
@ -7,6 +7,7 @@ import random
|
|||
import nltk
|
||||
import re
|
||||
from BeautifulSoup import BeautifulStoneSoup
|
||||
from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
|
||||
from django.db import models
|
||||
from django.db import IntegrityError
|
||||
from django.core.cache import cache
|
||||
|
@ -480,9 +481,8 @@ class Feed(models.Model):
|
|||
|
||||
self.save(lock=lock)
|
||||
|
||||
def calculate_collocations(self):
|
||||
trigram_measures = nltk.collocations.TrigramAssocMeasures()
|
||||
|
||||
def calculate_collocations(self, collocation_measures=TrigramAssocMeasures,
|
||||
collocation_finder=TrigramCollocationFinder):
|
||||
stories = Story.objects.filter(story_feed=self)
|
||||
story_content = ' '.join([s.story_content for s in stories])
|
||||
story_content = re.sub(r'’', '\'', story_content)
|
||||
|
@ -491,12 +491,12 @@ class Feed(models.Model):
|
|||
story_content = re.sub(r'</?\w+\s+[^>]*>', '', story_content)
|
||||
story_content = re.split(r"[^A-Za-z-']+", story_content)
|
||||
|
||||
finder = nltk.collocations.TrigramCollocationFinder.from_words(story_content)
|
||||
finder = collocation_finder.from_words(story_content)
|
||||
finder.apply_freq_filter(3)
|
||||
best = finder.nbest(trigram_measures.pmi, 10)
|
||||
best = finder.nbest(collocation_measures.pmi, 10)
|
||||
phrases = [' '.join(phrase) for phrase in best]
|
||||
|
||||
print phrases
|
||||
return phrases
|
||||
|
||||
class Meta:
|
||||
db_table="feeds"
|
||||
|
|
|
@ -56,7 +56,7 @@ def normalize(url):
|
|||
# Always use uppercase A-through-F characters when percent-encoding.
|
||||
# All portions of the URI must be utf-8 encoded NFC from Unicode strings
|
||||
def clean(string):
|
||||
string=unicode(unquote(string),'utf-8','replace')
|
||||
string=unicode(unquote(string))
|
||||
return unicodedata.normalize('NFC',string).encode('utf-8')
|
||||
path=quote(clean(path),"~:/?#[]@!$&'()*+,;=")
|
||||
fragment=quote(clean(fragment),"~")
|
||||
|
|
Loading…
Add table
Reference in a new issue