Fix Unicode decode error in URL normalization.

This commit is contained in:
Samuel Clay 2010-07-27 22:27:32 -04:00
parent db4776f3ea
commit 5e42e3db9c
2 changed files with 7 additions and 7 deletions

View file

@@ -7,6 +7,7 @@ import random
import nltk
import re
from BeautifulSoup import BeautifulStoneSoup
from nltk.collocations import TrigramCollocationFinder, BigramCollocationFinder, TrigramAssocMeasures, BigramAssocMeasures
from django.db import models
from django.db import IntegrityError
from django.core.cache import cache
@@ -480,9 +481,8 @@ class Feed(models.Model):
self.save(lock=lock)
def calculate_collocations(self):
trigram_measures = nltk.collocations.TrigramAssocMeasures()
def calculate_collocations(self, collocation_measures=TrigramAssocMeasures,
collocation_finder=TrigramCollocationFinder):
stories = Story.objects.filter(story_feed=self)
story_content = ' '.join([s.story_content for s in stories])
story_content = re.sub(r'’', '\'', story_content)
@@ -491,12 +491,12 @@ class Feed(models.Model):
story_content = re.sub(r'</?\w+\s+[^>]*>', '', story_content)
story_content = re.split(r"[^A-Za-z-']+", story_content)
finder = nltk.collocations.TrigramCollocationFinder.from_words(story_content)
finder = collocation_finder.from_words(story_content)
finder.apply_freq_filter(3)
best = finder.nbest(trigram_measures.pmi, 10)
best = finder.nbest(collocation_measures.pmi, 10)
phrases = [' '.join(phrase) for phrase in best]
print phrases
return phrases
class Meta:
db_table="feeds"

View file

@@ -56,7 +56,7 @@ def normalize(url):
# Always use uppercase A-through-F characters when percent-encoding.
# All portions of the URI must be utf-8 encoded NFC from Unicode strings
def clean(string):
string=unicode(unquote(string),'utf-8','replace')
string=unicode(unquote(string))
return unicodedata.normalize('NFC',string).encode('utf-8')
path=quote(clean(path),"~:/?#[]@!$&'()*+,;=")
fragment=quote(clean(fragment),"~")