Mirror of https://github.com/samuelclay/NewsBlur.git, synced 2025-08-31 21:41:33 +00:00

commit c25c4478e6 (parent cb90344cda)

    Adding new autocomplete. Let's hope this doesn't destroy performance in Redis.

7 changed files with 1138 additions and 22 deletions
@@ -8,6 +8,7 @@ import mongoengine as mongo
import zlib
import hashlib
import redis
from urlparse import urlparse
from utils.feed_functions import Counter
from collections import defaultdict
from operator import itemgetter

@@ -36,6 +37,7 @@ from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
from utils.story_functions import strip_tags, htmldiff, strip_comments
from vendor.redis_completion.engine import RedisEngine

ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)

@@ -191,7 +193,6 @@ class Feed(models.Model):
        try:
            super(Feed, self).save(*args, **kwargs)
            return self
        except IntegrityError, e:
            logging.debug(" ---> ~FRFeed save collision (%s), checking dupe..." % e)
            duplicate_feeds = Feed.objects.filter(feed_address=self.feed_address,

@@ -209,8 +210,10 @@
                logging.debug(" ---> ~FRFound different feed (%s), merging..." % duplicate_feeds[0])
                feed = Feed.get_by_id(merge_feeds(duplicate_feeds[0].pk, self.pk))
                return feed

            return self

        self.sync_autocompletion()

        return self

    def index_for_search(self):
        if self.num_subscribers > 1 and not self.branch_from_feed:

@@ -223,6 +226,31 @@
    def sync_redis(self):
        return MStory.sync_all_redis(self.pk)

    def sync_autocompletion(self):
        if self.num_subscribers <= 1: return
        if self.branch_from_feed: return
        if any(t in self.feed_address for t in ['token', 'private']): return

        engine = RedisEngine(prefix="FT", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
        engine.store(self.pk, title=self.feed_title)
        engine.boost(self.pk, self.num_subscribers)

        parts = urlparse(self.feed_address)
        engine = RedisEngine(prefix="FA", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
        engine.store(self.pk, title=parts.hostname)
        engine.boost(self.pk, self.num_subscribers)

    @classmethod
    def autocomplete(self, prefix, limit=5):
        engine = RedisEngine(prefix="FA", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
        results = engine.search(phrase=prefix, limit=limit, autoboost=True)

        if len(results) < limit:
            engine = RedisEngine(prefix="FT", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
            results += engine.search(phrase=prefix, limit=limit-len(results), autoboost=True, filters=[lambda f: f not in results])

        return results

    @classmethod
    def find_or_create(cls, feed_address, feed_link, *args, **kwargs):
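A quick orientation sketch, not part of the commit: sync_autocompletion() maintains two indexes in the autocomplete Redis database, "FT" keyed on the feed title and "FA" keyed on the hostname of the feed address, each boosted by subscriber count. Feed.autocomplete() queries the hostname index first and tops the result list up from the title index. A minimal sketch of the intended call, assuming the model and settings changes in this commit are in place:

    # Illustrative only; ids come back as strings straight from Redis.
    feed_ids = Feed.autocomplete('daringfireball', limit=5)   # e.g. ['42']
    feeds = [Feed.get_by_id(feed_id) for feed_id in feed_ids]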
@@ -74,29 +74,14 @@ def feed_autocomplete(request):
    query = request.GET.get('term')
    version = int(request.GET.get('v', 1))

    if True or not user.profile.is_premium:
        return dict(code=-1, message="Overloaded, no autocomplete results.", feeds=[], term=query)
    # if True or not user.profile.is_premium:
    # return dict(code=-1, message="Overloaded, no autocomplete results.", feeds=[], term=query)

    if not query:
        return dict(code=-1, message="Specify a search 'term'.", feeds=[], term=query)

    feeds = []
    for field in ['feed_address', 'feed_title', 'feed_link']:
        if not feeds:
            feeds = Feed.objects.filter(**{
                '%s__icontains' % field: query,
                'num_subscribers__gt': 1,
                'branch_from_feed__isnull': True,
            }).exclude(
                Q(**{'%s__icontains' % field: 'token'}) |
                Q(**{'%s__icontains' % field: 'private'})
            ).only(
                'id',
                'feed_title',
                'feed_address',
                'num_subscribers'
            ).select_related("data").order_by('-num_subscribers')[:5]

    feed_ids = Feed.autocomplete(query)
    feeds = [Feed.get_by_id(feed_id) for feed_id in feed_ids]
    feeds = [{
        'id': feed.pk,
        'value': feed.feed_address,

@@ -104,6 +89,7 @@ def feed_autocomplete(request):
        'tagline': feed.data and feed.data.feed_tagline,
        'num_subscribers': feed.num_subscribers,
    } for feed in feeds]
    feeds = sorted(feeds, key=lambda f: -1 * f['num_subscribers'])

    feed_ids = [f['id'] for f in feeds]
    feed_icons = dict((icon.feed_id, icon) for icon in MFeedIcon.objects.filter(feed_id__in=feed_ids))
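A small aside, not part of the commit: the Redis engines hand back ids in prefix/score order, so the view re-sorts the assembled dicts by subscriber count before responding. Illustrative values only:

    feeds = [{'id': 1, 'num_subscribers': 12}, {'id': 2, 'num_subscribers': 340}]
    feeds = sorted(feeds, key=lambda f: -1 * f['num_subscribers'])
    # -> [{'id': 2, 'num_subscribers': 340}, {'id': 1, 'num_subscribers': 12}]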
@@ -549,6 +549,8 @@ REDIS_ANALYTICS_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=2)
REDIS_STATISTICS_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=3)
REDIS_FEED_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=4)
REDIS_SESSION_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=5)
# DB 6 = Session Store
REDIS_AUTOCOMPLETE_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=7)

JAMMIT = jammit.JammitAssets(NEWSBLUR_DIR)
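How the new pool is meant to be consumed (an illustrative sketch, not part of the commit): RedisEngine passes unrecognized keyword arguments straight through to redis.Redis, so handing it the shared pool avoids opening a fresh connection on every autocomplete call. The host below is a stand-in for REDIS['host']:

    import redis
    from vendor.redis_completion.engine import RedisEngine

    # Mirrors the settings line above; db=7 is the new autocomplete database.
    REDIS_AUTOCOMPLETE_POOL = redis.ConnectionPool(host='127.0.0.1', port=6379, db=7)
    engine = RedisEngine(prefix="FT", connection_pool=REDIS_AUTOCOMPLETE_POOL)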
vendor/redis_completion/__init__.py (new vendored executable file, 1 line)
@@ -0,0 +1 @@
from redis_completion.engine import RedisEngine
vendor/redis_completion/engine.py (new vendored executable file, 228 lines)
@@ -0,0 +1,228 @@
try:
    import simplejson as json
except ImportError:
    import json
import re
from redis import Redis

from redis_completion.stop_words import STOP_WORDS as _STOP_WORDS


# aggressive stop words will be better when the length of the document is longer
AGGRESSIVE_STOP_WORDS = _STOP_WORDS

# default stop words should work fine for titles and things like that
DEFAULT_STOP_WORDS = set(['a', 'an', 'of', 'the'])


class RedisEngine(object):
    """
    References
    ----------

    http://antirez.com/post/autocomplete-with-redis.html
    http://stackoverflow.com/questions/1958005/redis-autocomplete/1966188#1966188
    http://patshaughnessy.net/2011/11/29/two-ways-of-using-redis-to-build-a-nosql-autocomplete-search-index
    """
    def __init__(self, prefix='ac', stop_words=None, cache_timeout=300, **conn_kwargs):
        self.prefix = prefix
        self.stop_words = (stop_words is None) and DEFAULT_STOP_WORDS or stop_words

        self.conn_kwargs = conn_kwargs
        self.client = self.get_client()

        self.cache_timeout = cache_timeout

        self.boost_key = '%s:b' % self.prefix
        self.data_key = '%s:d' % self.prefix
        self.title_key = '%s:t' % self.prefix
        self.search_key = lambda k: '%s:s:%s' % (self.prefix, k)
        self.cache_key = lambda pk, bk: '%s:c:%s:%s' % (self.prefix, pk, bk)

        self.kcombine = lambda _id, _type: str(_id)
        self.ksplit = lambda k: k

    def get_client(self):
        return Redis(**self.conn_kwargs)

    def score_key(self, k, max_size=20):
        k_len = len(k)
        a = ord('a') - 2
        score = 0

        for i in range(max_size):
            if i < k_len:
                c = (ord(k[i]) - a)
                if c < 2 or c > 27:
                    c = 1
            else:
                c = 1
            score += c*(27**(max_size-i))
        return score
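    # Note: score_key() packs the first max_size characters of a key into one
    # number, base-27 digit by digit: 'a' maps to 2, 'z' to 27, and any other
    # character (or padding past the end of the string) maps to 1. Earlier
    # characters get larger powers of 27, so the numeric order of the scores
    # matches the lexicographic order of the keys, e.g.
    # score_key('ab') < score_key('ac') < score_key('b').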
    def clean_phrase(self, phrase):
        phrase = re.sub('[^a-z0-9_\-\s]', '', phrase.lower())
        return [w for w in phrase.split() if w not in self.stop_words]

    def create_key(self, phrase):
        return ' '.join(self.clean_phrase(phrase))

    def autocomplete_keys(self, w):
        for i in range(1, len(w)):
            yield w[:i]
        yield w
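    # Note: autocomplete_keys('python') yields 'p', 'py', 'pyt', 'pyth',
    # 'pytho', 'python'. Keeping one sorted set per prefix is what makes a
    # lookup a single key read at query time, at the cost of one zadd per
    # prefix per word at store time.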
    def flush(self, everything=False, batch_size=1000):
        if everything:
            return self.client.flushdb()

        # this could be expensive :-(
        keys = self.client.keys('%s:*' % self.prefix)

        # batch keys
        for i in range(0, len(keys), batch_size):
            self.client.delete(*keys[i:i+batch_size])

    def store(self, obj_id, title=None, data=None, obj_type=None, check_exist=True):
        if title is None:
            title = obj_id
        if data is None:
            data = title

        title_score = self.score_key(self.create_key(title))

        combined_id = self.kcombine(obj_id, obj_type or '')

        if check_exist and self.exists(obj_id, obj_type):
            stored_title = self.client.hget(self.title_key, combined_id)

            # if the stored title is the same, we can simply update the data key
            # since everything else will have stayed the same
            if stored_title == title:
                self.client.hset(self.data_key, combined_id, data)
                return
            else:
                self.remove(obj_id, obj_type)

        pipe = self.client.pipeline()
        pipe.hset(self.data_key, combined_id, data)
        pipe.hset(self.title_key, combined_id, title)

        for word in self.clean_phrase(title):
            for partial_key in self.autocomplete_keys(word):
                pipe.zadd(self.search_key(partial_key), combined_id, title_score)

        pipe.execute()

    def store_json(self, obj_id, title, data_dict, obj_type=None):
        return self.store(obj_id, title, json.dumps(data_dict), obj_type)

    def remove(self, obj_id, obj_type=None):
        obj_id = self.kcombine(obj_id, obj_type or '')
        title = self.client.hget(self.title_key, obj_id) or ''
        keys = []

        for word in self.clean_phrase(title):
            for partial_key in self.autocomplete_keys(word):
                key = self.search_key(partial_key)
                if not self.client.zrange(key, 1, 2):
                    self.client.delete(key)
                else:
                    self.client.zrem(key, obj_id)

        self.client.hdel(self.data_key, obj_id)
        self.client.hdel(self.title_key, obj_id)
        self.client.hdel(self.boost_key, obj_id)

    def boost(self, obj_id, multiplier=1.1, negative=False):
        # take the existing boost for this item and increase it by the multiplier
        current = self.client.hget(self.boost_key, obj_id)
        current_f = float(current or 1.0)
        if negative:
            multiplier = 1 / multiplier
        self.client.hset(self.boost_key, obj_id, current_f * multiplier)
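    # Note on boost(): calls compound, so with the default multiplier a second
    # call takes an item from 1.0 to roughly 1.21; negative=True divides
    # instead, so a boost/negative pair approximately cancels out. search()
    # multiplies a matched entry's sorted-set score by 1 / boost, and because
    # results come back in ascending score order, boosts above 1.0 move an
    # entry toward the front.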
    def exists(self, obj_id, obj_type=None):
        obj_id = self.kcombine(obj_id, obj_type or '')
        return self.client.hexists(self.data_key, obj_id)

    def get_cache_key(self, phrases, boosts):
        if boosts:
            boost_key = '|'.join('%s:%s' % (k, v) for k, v in sorted(boosts.items()))
        else:
            boost_key = ''
        phrase_key = '|'.join(phrases)
        return self.cache_key(phrase_key, boost_key)

    def _process_ids(self, id_list, limit, filters, mappers):
        ct = 0
        data = []

        for raw_id in id_list:
            # raw_data = self.client.hget(self.data_key, raw_id)
            raw_data = raw_id
            if not raw_data:
                continue

            if mappers:
                for m in mappers:
                    raw_data = m(raw_data)

            if filters:
                passes = True
                for f in filters:
                    if not f(raw_data):
                        passes = False
                        break

                if not passes:
                    continue

            data.append(raw_data)
            ct += 1
            if limit and ct == limit:
                break

        return data

    def search(self, phrase, limit=None, filters=None, mappers=None, boosts=None, autoboost=False):
        cleaned = self.clean_phrase(phrase)
        if not cleaned:
            return []

        if autoboost:
            boosts = boosts or {}
            stored = self.client.hgetall(self.boost_key)
            for obj_id in stored:
                if obj_id not in boosts:
                    boosts[obj_id] = float(stored[obj_id])

        if len(cleaned) == 1 and not boosts:
            new_key = self.search_key(cleaned[0])
        else:
            new_key = self.get_cache_key(cleaned, boosts)
            if not self.client.exists(new_key):
                # zinterstore also takes {k1: wt1, k2: wt2}
                self.client.zinterstore(new_key, map(self.search_key, cleaned))
                self.client.expire(new_key, self.cache_timeout)

        if boosts:
            pipe = self.client.pipeline()
            for raw_id, score in self.client.zrange(new_key, 0, -1, withscores=True):
                orig_score = score
                for part in self.ksplit(raw_id):
                    if part and part in boosts:
                        score *= 1 / boosts[part]
                if orig_score != score:
                    pipe.zadd(new_key, raw_id, score)
            pipe.execute()

        id_list = self.client.zrange(new_key, 0, -1)
        # return id_list
        return self._process_ids(id_list, limit, filters, mappers)

    def search_json(self, phrase, limit=None, filters=None, mappers=None, boosts=None, autoboost=False):
        if not mappers:
            mappers = []
        mappers.insert(0, json.loads)
        return self.search(phrase, limit, filters, mappers, boosts, autoboost)
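For a concrete feel of the engine above, here is an illustrative sketch, not part of the commit, that assumes a local Redis server and a disposable database 9. Key names follow the lambdas in __init__: demo:t and demo:d hashes for titles and data, demo:b for boosts, and one demo:s:<prefix> sorted set per generated prefix.

    # Illustrative sketch only; assumes Redis database 9 is disposable.
    from vendor.redis_completion.engine import RedisEngine

    engine = RedisEngine(prefix='demo', db=9)
    engine.flush()                             # deletes demo:* keys only

    engine.store(42, title='Example Blog')     # indexes every prefix of 'example' and 'blog'
    engine.boost(42, 150)                      # demo:b entry '42' -> 150.0

    results = engine.search('exam', autoboost=True)
    # -> ['42']; this vendored copy of _process_ids returns raw ids, not stored data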
vendor/redis_completion/stop_words.py (new vendored executable file, 594 lines)
@@ -0,0 +1,594 @@
words = """a
a's
able
about
above
according
accordingly
across
actually
after
afterwards
again
against
ain't
all
allow
allows
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anybody
anyhow
anyone
anything
anyway
anyways
anywhere
apart
appear
appreciate
appropriate
are
aren't
around
as
aside
ask
asking
associated
at
available
away
awfully
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
believe
below
beside
besides
best
better
between
beyond
bill
both
bottom
brief
but
by
c'mon
c's
call
came
can
can't
cannot
cant
cause
causes
certain
certainly
changes
clearly
co
com
come
comes
computer
con
concerning
consequently
consider
considering
contain
containing
contains
corresponding
could
couldn't
couldnt
course
cry
currently
de
definitely
describe
described
despite
detail
did
didn't
different
do
does
doesn't
doing
don't
done
down
downwards
due
during
each
edu
eg
eight
either
eleven
else
elsewhere
empty
enough
entirely
especially
et
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
exactly
example
except
far
few
fifteen
fifth
fify
fill
find
fire
first
five
followed
following
follows
for
former
formerly
forth
forty
found
four
from
front
full
further
furthermore
get
gets
getting
give
given
gives
go
goes
going
gone
got
gotten
greetings
had
hadn't
happens
hardly
has
hasn't
hasnt
have
haven't
having
he
he's
hello
help
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
hi
him
himself
his
hither
hopefully
how
howbeit
however
hundred
i
i'd
i'll
i'm
i've
ie
if
ignored
immediate
in
inasmuch
inc
indeed
indicate
indicated
indicates
inner
insofar
instead
interest
into
inward
is
isn't
it
it'd
it'll
it's
its
itself
just
keep
keeps
kept
know
known
knows
last
lately
later
latter
latterly
least
less
lest
let
let's
like
liked
likely
little
look
looking
looks
ltd
made
mainly
many
may
maybe
me
mean
meanwhile
merely
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
nd
near
nearly
necessary
need
needs
neither
never
nevertheless
new
next
nine
no
nobody
non
none
noone
nor
normally
not
nothing
novel
now
nowhere
obviously
of
off
often
oh
ok
okay
old
on
once
one
ones
only
onto
or
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
own
part
particular
particularly
per
perhaps
placed
please
plus
possible
presumably
probably
provides
put
que
quite
qv
rather
rd
re
really
reasonably
regarding
regardless
regards
relatively
respectively
right
said
same
saw
say
saying
says
second
secondly
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sensible
sent
serious
seriously
seven
several
shall
she
should
shouldn't
show
side
since
sincere
six
sixty
so
some
somebody
somehow
someone
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specified
specify
specifying
still
sub
such
sup
sure
system
t's
take
taken
tell
ten
tends
th
than
thank
thanks
thanx
that
that's
thats
the
their
theirs
them
themselves
then
thence
there
there's
thereafter
thereby
therefore
therein
theres
thereupon
these
they
they'd
they'll
they're
they've
thick
thin
think
third
this
thorough
thoroughly
those
though
three
through
throughout
thru
thus
to
together
too
took
top
toward
towards
tried
tries
truly
try
trying
twelve
twenty
twice
two
un
under
unfortunately
unless
unlikely
until
unto
up
upon
us
use
used
useful
uses
using
usually
value
various
very
via
viz
vs
want
wants
was
wasn't
way
we
we'd
we'll
we're
we've
welcome
well
went
were
weren't
what
what's
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who's
whoever
whole
whom
whose
why
will
willing
wish
with
within
without
won't
wonder
would
wouldn't
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
zero"""
STOP_WORDS = set([
    w.strip() for w in words.splitlines() if w
])
vendor/redis_completion/tests.py (new vendored executable file, 277 lines)
@@ -0,0 +1,277 @@
import random
from unittest import TestCase

from redis_completion.engine import RedisEngine


stop_words = set(['a', 'an', 'the', 'of'])

class RedisCompletionTestCase(TestCase):
    def setUp(self):
        self.engine = self.get_engine()
        self.engine.flush()

    def get_engine(self):
        return RedisEngine(prefix='testac', db=15)

    def store_data(self, id=None):
        test_data = (
            (1, 'testing python'),
            (2, 'testing python code'),
            (3, 'web testing python code'),
            (4, 'unit tests with python'),
        )
        for obj_id, title in test_data:
            if id is None or id == obj_id:
                self.engine.store_json(obj_id, title, {
                    'obj_id': obj_id,
                    'title': title,
                    'secret': obj_id % 2 == 0 and 'derp' or 'herp',
                })

    def sort_results(self, r):
        return sorted(r, key=lambda i:i['obj_id'])

    def test_search(self):
        self.store_data()

        results = self.engine.search_json('testing python')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 2, 'title': 'testing python code', 'secret': 'derp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])

        results = self.engine.search_json('test')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 2, 'title': 'testing python code', 'secret': 'derp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
            {'obj_id': 4, 'title': 'unit tests with python', 'secret': 'derp'},
        ])

        results = self.engine.search_json('unit')
        self.assertEqual(results, [
            {'obj_id': 4, 'title': 'unit tests with python', 'secret': 'derp'},
        ])

        results = self.engine.search_json('')
        self.assertEqual(results, [])

        results = self.engine.search_json('missing')
        self.assertEqual(results, [])

    def test_boosting(self):
        test_data = (
            (1, 'test alpha', 't1'),
            (2, 'test beta', 't1'),
            (3, 'test gamma', 't1'),
            (4, 'test delta', 't1'),
            (5, 'test alpha', 't2'),
            (6, 'test beta', 't2'),
            (7, 'test gamma', 't2'),
            (8, 'test delta', 't2'),
            (9, 'test alpha', 't3'),
            (10, 'test beta', 't3'),
            (11, 'test gamma', 't3'),
            (12, 'test delta', 't3'),
        )
        for obj_id, title, obj_type in test_data:
            self.engine.store_json(obj_id, title, {
                'obj_id': obj_id,
                'title': title,
            }, obj_type)

        def assertExpected(results, id_list):
            self.assertEqual([r['obj_id'] for r in results], id_list)

        results = self.engine.search_json('alp')
        assertExpected(results, [1, 5, 9])

        results = self.engine.search_json('alp', boosts={'t2': 1.1})
        assertExpected(results, [5, 1, 9])

        results = self.engine.search_json('test', boosts={'t3': 1.5, 't2': 1.1})
        assertExpected(results, [9, 10, 12, 11, 5, 6, 8, 7, 1, 2, 4, 3])

        results = self.engine.search_json('alp', boosts={'t1': 0.5})
        assertExpected(results, [5, 9, 1])

        results = self.engine.search_json('alp', boosts={'t1': 1.5, 't3': 1.6})
        assertExpected(results, [9, 1, 5])

        results = self.engine.search_json('alp', boosts={'t3': 1.5, '5': 1.6})
        assertExpected(results, [5, 9, 1])

    def test_autoboost(self):
        self.engine.store('t1', 'testing 1')
        self.engine.store('t2', 'testing 2')
        self.engine.store('t3', 'testing 3')
        self.engine.store('t4', 'testing 4')
        self.engine.store('t5', 'testing 5')

        def assertExpected(results, id_list):
            self.assertEqual(results, ['testing %s' % i for i in id_list])

        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [1, 2, 3, 4, 5])

        self.engine.boost('t3')
        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [3, 1, 2, 4, 5])

        self.engine.boost('t2')
        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [2, 3, 1, 4, 5])

        self.engine.boost('t1', negative=True)
        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [2, 3, 4, 5, 1])

        results = self.engine.search('testing', boosts={'t5': 4.0}, autoboost=True)
        assertExpected(results, [5, 2, 3, 4, 1])

        results = self.engine.search('testing', boosts={'t3': 1.5}, autoboost=True)
        assertExpected(results, [3, 2, 4, 5, 1])

    def test_limit(self):
        self.store_data()

        results = self.engine.search_json('testing', limit=1)
        self.assertEqual(results, [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
        ])

    def test_filters(self):
        self.store_data()

        f = lambda i: i['secret'] == 'herp'
        results = self.engine.search_json('testing python', filters=[f])

        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])

    def test_simple(self):
        self.engine.print_scores = True
        self.engine.store('testing python')
        self.engine.store('testing python code')
        self.engine.store('web testing python code')
        self.engine.store('unit tests with python')

        results = self.engine.search('testing')
        self.assertEqual(results, ['testing python', 'testing python code', 'web testing python code'])

        results = self.engine.search('code')
        self.assertEqual(results, ['testing python code', 'web testing python code'])

    def test_correct_sorting(self):
        strings = []
        for i in range(26):
            strings.append('aaaa%s' % chr(i + ord('a')))
            if i > 0:
                strings.append('aaa%sa' % chr(i + ord('a')))

        random.shuffle(strings)

        for s in strings:
            self.engine.store(s)

        results = self.engine.search('aaa')
        self.assertEqual(results, sorted(strings))

        results = self.engine.search('aaa', limit=30)
        self.assertEqual(results, sorted(strings)[:30])

    def test_removing_objects(self):
        self.store_data()

        self.engine.remove(1)

        results = self.engine.search_json('testing')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 2, 'title': 'testing python code', 'secret': 'derp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])

        self.store_data(1)
        self.engine.remove(2)

        results = self.engine.search_json('testing')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])

    def test_clean_phrase(self):
        self.assertEqual(self.engine.clean_phrase('abc def ghi'), ['abc', 'def', 'ghi'])

        self.assertEqual(self.engine.clean_phrase('a A tHe an a'), [])
        self.assertEqual(self.engine.clean_phrase(''), [])

        self.assertEqual(
            self.engine.clean_phrase('The Best of times, the blurst of times'),
            ['best', 'times', 'blurst', 'times'])

    def test_exists(self):
        self.assertFalse(self.engine.exists('test'))
        self.engine.store('test')
        self.assertTrue(self.engine.exists('test'))

    def test_removing_objects_in_depth(self):
        # want to ensure that redis is cleaned up and does not become polluted
        # with spurious keys when objects are removed
        redis_client = self.engine.client
        prefix = self.engine.prefix

        initial_key_count = len(redis_client.keys())

        # store the blog "testing python"
        self.store_data(1)

        # see how many keys we have in the db - check again in a bit
        key_len = len(redis_client.keys())

        self.store_data(2)
        key_len2 = len(redis_client.keys())

        self.assertTrue(key_len != key_len2)
        self.engine.remove(2)

        # back to the original amount of keys
        self.assertEqual(len(redis_client.keys()), key_len)

        self.engine.remove(1)
        self.assertEqual(len(redis_client.keys()), initial_key_count)

    def test_updating(self):
        self.engine.store('id1', 'title one', 'd1', 't1')
        self.engine.store('id2', 'title two', 'd2', 't2')
        self.engine.store('id3', 'title three', 'd3', 't3')

        results = self.engine.search('tit')
        self.assertEqual(results, ['d1', 'd3', 'd2'])

        # overwrite the data for id1
        self.engine.store('id1', 'title one', 'D1', 't1')

        results = self.engine.search('tit')
        self.assertEqual(results, ['D1', 'd3', 'd2'])

        # overwrite the data with a new title, will remove the title one refs
        self.engine.store('id1', 'Herple One', 'done', 't1')

        results = self.engine.search('tit')
        self.assertEqual(results, ['d3', 'd2'])

        results = self.engine.search('her')
        self.assertEqual(results, ['done'])

        self.engine.store('id1', 'title one', 'Done', 't1', False)
        results = self.engine.search('tit')
        self.assertEqual(results, ['Done', 'd3', 'd2'])

        # this shows that when we don't clean up crap gets left around
        results = self.engine.search('her')
        self.assertEqual(results, ['Done'])
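A note on running the suite, not part of the commit: the tests talk to a live Redis instance (get_engine points at db 15 and setUp flushes it), so they should never run against a database holding real data. Also, because _process_ids in this vendored engine returns raw ids rather than the stored data (the hget call is commented out above), tests that assert on stored JSON may not pass unmodified. A minimal runner sketch, assuming redis_completion is importable and a local Redis server is listening on the default port:

    import unittest
    from redis_completion import tests

    unittest.TextTestRunner(verbosity=2).run(
        unittest.defaultTestLoader.loadTestsFromModule(tests))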