Adding new autocomplete. Let's hope this doesn't destroy performance in Redis.

This commit is contained in:
Samuel Clay 2013-04-08 12:54:02 -07:00
parent cb90344cda
commit c25c4478e6
7 changed files with 1138 additions and 22 deletions

View file

@ -8,6 +8,7 @@ import mongoengine as mongo
import zlib
import hashlib
import redis
from urlparse import urlparse
from utils.feed_functions import Counter
from collections import defaultdict
from operator import itemgetter
@ -36,6 +37,7 @@ from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
from utils.story_functions import strip_tags, htmldiff, strip_comments
from vendor.redis_completion.engine import RedisEngine
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@ -191,7 +193,6 @@ class Feed(models.Model):
try:
super(Feed, self).save(*args, **kwargs)
return self
except IntegrityError, e:
logging.debug(" ---> ~FRFeed save collision (%s), checking dupe..." % e)
duplicate_feeds = Feed.objects.filter(feed_address=self.feed_address,
@ -209,8 +210,10 @@ class Feed(models.Model):
logging.debug(" ---> ~FRFound different feed (%s), merging..." % duplicate_feeds[0])
feed = Feed.get_by_id(merge_feeds(duplicate_feeds[0].pk, self.pk))
return feed
return self
self.sync_autocompletion()
return self
def index_for_search(self):
if self.num_subscribers > 1 and not self.branch_from_feed:
@ -223,6 +226,31 @@ class Feed(models.Model):
def sync_redis(self):
    # Push this feed's stories into Redis; delegates entirely to MStory.
    return MStory.sync_all_redis(self.pk)
def sync_autocompletion(self):
    """Mirror this feed into the Redis autocomplete indexes.

    Writes the feed title under the "FT" prefix and the feed address
    hostname under the "FA" prefix, boosting each entry by the
    subscriber count.
    """
    # Guard clauses: skip single-subscriber feeds, branched feeds, and
    # addresses that look private.
    if self.num_subscribers <= 1:
        return
    if self.branch_from_feed:
        return
    if any(marker in self.feed_address for marker in ['token', 'private']):
        return

    # NOTE(review): boost() multiplies the stored boost, so passing the
    # absolute subscriber count on every sync compounds it -- confirm intended.
    title_engine = RedisEngine(prefix="FT", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
    title_engine.store(self.pk, title=self.feed_title)
    title_engine.boost(self.pk, self.num_subscribers)

    address_engine = RedisEngine(prefix="FA", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
    address_engine.store(self.pk, title=urlparse(self.feed_address).hostname)
    address_engine.boost(self.pk, self.num_subscribers)
@classmethod
def autocomplete(cls, prefix, limit=5):
    """Return up to `limit` feed ids whose address or title matches `prefix`.

    Searches the address index ("FA") first, then tops up from the title
    index ("FT"), excluding ids already found.
    """
    # Fixed: classmethod's first parameter was named `self`; it receives
    # the class, so the conventional name is `cls`.
    engine = RedisEngine(prefix="FA", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
    results = engine.search(phrase=prefix, limit=limit, autoboost=True)
    if len(results) < limit:
        engine = RedisEngine(prefix="FT", connection_pool=settings.REDIS_AUTOCOMPLETE_POOL)
        # The filter closes over `results` to drop duplicate ids.
        results += engine.search(phrase=prefix, limit=limit - len(results),
                                 autoboost=True,
                                 filters=[lambda f: f not in results])
    return results
@classmethod
def find_or_create(cls, feed_address, feed_link, *args, **kwargs):

View file

@ -74,29 +74,14 @@ def feed_autocomplete(request):
query = request.GET.get('term')
version = int(request.GET.get('v', 1))
if True or not user.profile.is_premium:
return dict(code=-1, message="Overloaded, no autocomplete results.", feeds=[], term=query)
# if True or not user.profile.is_premium:
# return dict(code=-1, message="Overloaded, no autocomplete results.", feeds=[], term=query)
if not query:
return dict(code=-1, message="Specify a search 'term'.", feeds=[], term=query)
feeds = []
for field in ['feed_address', 'feed_title', 'feed_link']:
if not feeds:
feeds = Feed.objects.filter(**{
'%s__icontains' % field: query,
'num_subscribers__gt': 1,
'branch_from_feed__isnull': True,
}).exclude(
Q(**{'%s__icontains' % field: 'token'}) |
Q(**{'%s__icontains' % field: 'private'})
).only(
'id',
'feed_title',
'feed_address',
'num_subscribers'
).select_related("data").order_by('-num_subscribers')[:5]
feed_ids = Feed.autocomplete(query)
feeds = [Feed.get_by_id(feed_id) for feed_id in feed_ids]
feeds = [{
'id': feed.pk,
'value': feed.feed_address,
@ -104,6 +89,7 @@ def feed_autocomplete(request):
'tagline': feed.data and feed.data.feed_tagline,
'num_subscribers': feed.num_subscribers,
} for feed in feeds]
feeds = sorted(feeds, key=lambda f: -1 * f['num_subscribers'])
feed_ids = [f['id'] for f in feeds]
feed_icons = dict((icon.feed_id, icon) for icon in MFeedIcon.objects.filter(feed_id__in=feed_ids))

View file

@ -549,6 +549,8 @@ REDIS_ANALYTICS_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=2)
REDIS_STATISTICS_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=3)
REDIS_FEED_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=4)
REDIS_SESSION_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=5)
# DB 6 = Session Store
# DB 7 holds the feed autocomplete indexes (see Feed.sync_autocompletion).
REDIS_AUTOCOMPLETE_POOL = redis.ConnectionPool(host=REDIS['host'], port=6379, db=7)
JAMMIT = jammit.JammitAssets(NEWSBLUR_DIR)

1
vendor/redis_completion/__init__.py vendored Executable file
View file

@ -0,0 +1 @@
from redis_completion.engine import RedisEngine

228
vendor/redis_completion/engine.py vendored Executable file
View file

@ -0,0 +1,228 @@
try:
import simplejson as json
except ImportError:
import json
import re
from redis import Redis
from redis_completion.stop_words import STOP_WORDS as _STOP_WORDS
# aggressive stop words will be better when the length of the document is longer
# (this is the full 500+ word list imported from stop_words.py)
AGGRESSIVE_STOP_WORDS = _STOP_WORDS
# default stop words should work fine for titles and things like that
DEFAULT_STOP_WORDS = set(['a', 'an', 'of', 'the'])
class RedisEngine(object):
    """
    Redis-backed prefix autocomplete engine.

    Titles are lower-cased and tokenized; every prefix of every token
    gets a sorted set (``<prefix>:s:<partial>``) whose members are object
    ids, scored so that a plain ZRANGE returns titles in lexicographic
    order. Payloads live in the ``:d`` hash, titles in ``:t`` and
    per-object boost multipliers in ``:b``.

    References
    ----------
    http://antirez.com/post/autocomplete-with-redis.html
    http://stackoverflow.com/questions/1958005/redis-autocomplete/1966188#1966188
    http://patshaughnessy.net/2011/11/29/two-ways-of-using-redis-to-build-a-nosql-autocomplete-search-index
    """
    def __init__(self, prefix='ac', stop_words=None, cache_timeout=300, **conn_kwargs):
        self.prefix = prefix
        self.stop_words = DEFAULT_STOP_WORDS if stop_words is None else stop_words
        self.conn_kwargs = conn_kwargs
        self.client = self.get_client()
        self.cache_timeout = cache_timeout  # seconds before cached intersections expire

        # Key layout, all namespaced under self.prefix.
        self.boost_key = '%s:b' % self.prefix
        self.data_key = '%s:d' % self.prefix
        self.title_key = '%s:t' % self.prefix
        self.search_key = lambda k: '%s:s:%s' % (self.prefix, k)
        self.cache_key = lambda pk, bk: '%s:c:%s:%s' % (self.prefix, pk, bk)

        # NewsBlur flattens the upstream (id, type) key scheme to the bare id.
        self.kcombine = lambda _id, _type: str(_id)
        # BUGFIX: search() iterates `for part in self.ksplit(raw_id)`, so
        # ksplit must yield whole key parts. The previous identity lambda
        # returned the raw string, which iterated per *character* and made
        # boost lookups miss every multi-character id.
        self.ksplit = lambda k: [k]

    def get_client(self):
        """Build a Redis client from the kwargs given at construction."""
        return Redis(**self.conn_kwargs)

    def score_key(self, k, max_size=20):
        """Map a cleaned key to a number whose sort order matches
        lexicographic order.

        The first `max_size` characters are treated as digits of a
        base-27 number ('a' -> 2 ... 'z' -> 27; anything else, including
        right-padding, -> 1).

        NOTE(review): 27**20 far exceeds double precision and Redis stores
        zset scores as doubles, so characters beyond roughly the 10th
        cannot influence the stored score -- confirm that is acceptable.
        """
        k_len = len(k)
        a = ord('a') - 2
        score = 0
        for i in range(max_size):
            if i < k_len:
                c = ord(k[i]) - a
                if c < 2 or c > 27:
                    c = 1
            else:
                c = 1
            score += c * (27 ** (max_size - i))
        return score

    def clean_phrase(self, phrase):
        """Lowercase, strip non-word characters, and drop stop words."""
        phrase = re.sub(r'[^a-z0-9_\-\s]', '', phrase.lower())
        return [w for w in phrase.split() if w not in self.stop_words]

    def create_key(self, phrase):
        """Canonical single-string form of a cleaned phrase."""
        return ' '.join(self.clean_phrase(phrase))

    def autocomplete_keys(self, w):
        """Yield every prefix of `w`, shortest first, ending with `w` itself."""
        for i in range(1, len(w)):
            yield w[:i]
        yield w

    def flush(self, everything=False, batch_size=1000):
        """Delete this engine's keys (or the whole db when everything=True)."""
        if everything:
            return self.client.flushdb()
        # this could be expensive :-(
        keys = self.client.keys('%s:*' % self.prefix)
        # delete in batches to keep each command's argument list bounded
        for i in range(0, len(keys), batch_size):
            self.client.delete(*keys[i:i + batch_size])

    def store(self, obj_id, title=None, data=None, obj_type=None, check_exist=True):
        """Index `title` under `obj_id`, storing `data` as the payload.

        With check_exist, re-storing an unchanged title only refreshes the
        payload; a changed title removes the old index entries first so no
        stale prefixes linger.
        """
        if title is None:
            title = obj_id
        if data is None:
            data = title
        title_score = self.score_key(self.create_key(title))
        combined_id = self.kcombine(obj_id, obj_type or '')
        if check_exist and self.exists(obj_id, obj_type):
            stored_title = self.client.hget(self.title_key, combined_id)
            # if the stored title is the same, we can simply update the data key
            # since everything else will have stayed the same
            if stored_title == title:
                self.client.hset(self.data_key, combined_id, data)
                return
            else:
                self.remove(obj_id, obj_type)
        pipe = self.client.pipeline()
        pipe.hset(self.data_key, combined_id, data)
        pipe.hset(self.title_key, combined_id, title)
        for word in self.clean_phrase(title):
            for partial_key in self.autocomplete_keys(word):
                # NOTE(review): (member, score) argument order matches the
                # redis-py 2.x zadd signature this project vendors against.
                pipe.zadd(self.search_key(partial_key), combined_id, title_score)
        pipe.execute()

    def store_json(self, obj_id, title, data_dict, obj_type=None):
        """store() with the payload serialized as JSON."""
        return self.store(obj_id, title, json.dumps(data_dict), obj_type)

    def remove(self, obj_id, obj_type=None):
        """Delete an object and every index entry derived from its title."""
        obj_id = self.kcombine(obj_id, obj_type or '')
        title = self.client.hget(self.title_key, obj_id) or ''
        for word in self.clean_phrase(title):
            for partial_key in self.autocomplete_keys(word):
                key = self.search_key(partial_key)
                # Fewer than two members means this object is the only one
                # left in the set -- drop the whole key rather than zrem.
                if not self.client.zrange(key, 1, 2):
                    self.client.delete(key)
                else:
                    self.client.zrem(key, obj_id)
        self.client.hdel(self.data_key, obj_id)
        self.client.hdel(self.title_key, obj_id)
        self.client.hdel(self.boost_key, obj_id)

    def boost(self, obj_id, multiplier=1.1, negative=False):
        """Multiply the stored boost for `obj_id` (divide when negative)."""
        # take the existing boost for this item and increase it by the multiplier
        current = self.client.hget(self.boost_key, obj_id)
        current_f = float(current or 1.0)
        if negative:
            multiplier = 1 / multiplier
        self.client.hset(self.boost_key, obj_id, current_f * multiplier)

    def exists(self, obj_id, obj_type=None):
        """True if a payload is stored for this (id, type)."""
        obj_id = self.kcombine(obj_id, obj_type or '')
        return self.client.hexists(self.data_key, obj_id)

    def get_cache_key(self, phrases, boosts):
        """Cache key for an intersection, unique per phrase list + boost map."""
        if boosts:
            boost_key = '|'.join('%s:%s' % (k, v) for k, v in sorted(boosts.items()))
        else:
            boost_key = ''
        phrase_key = '|'.join(phrases)
        return self.cache_key(phrase_key, boost_key)

    def _process_ids(self, id_list, limit, filters, mappers):
        """Apply mappers then filters to raw ids, stopping at `limit` hits."""
        ct = 0
        data = []
        for raw_id in id_list:
            # NewsBlur change: return the raw id itself rather than the
            # stored payload (the hget below is intentionally skipped).
            # raw_data = self.client.hget(self.data_key, raw_id)
            raw_data = raw_id
            if not raw_data:
                continue
            if mappers:
                for m in mappers:
                    raw_data = m(raw_data)
            if filters and not all(f(raw_data) for f in filters):
                continue
            data.append(raw_data)
            ct += 1
            if limit and ct == limit:
                break
        return data

    def search(self, phrase, limit=None, filters=None, mappers=None, boosts=None, autoboost=False):
        """Return ids whose titles match every word in `phrase`.

        autoboost merges the persistent per-object boosts (stored via
        boost()) into `boosts`. Multi-word or boosted searches are
        materialized via ZINTERSTORE into a cache key that lives for
        cache_timeout seconds.
        """
        cleaned = self.clean_phrase(phrase)
        if not cleaned:
            return []
        if autoboost:
            boosts = boosts or {}
            stored = self.client.hgetall(self.boost_key)
            for obj_id in stored:
                if obj_id not in boosts:
                    boosts[obj_id] = float(stored[obj_id])
        if len(cleaned) == 1 and not boosts:
            # Single unboosted word: query its index key directly.
            new_key = self.search_key(cleaned[0])
        else:
            new_key = self.get_cache_key(cleaned, boosts)
            if not self.client.exists(new_key):
                # zinterstore also takes {k1: wt1, k2: wt2}
                self.client.zinterstore(new_key, map(self.search_key, cleaned))
                self.client.expire(new_key, self.cache_timeout)
                # Apply boosts only when materializing the key; a cached key
                # already carries adjusted scores, and re-applying on every
                # hit would compound them.
                if boosts:
                    pipe = self.client.pipeline()
                    for raw_id, score in self.client.zrange(new_key, 0, -1, withscores=True):
                        orig_score = score
                        for part in self.ksplit(raw_id):
                            if part and part in boosts:
                                # Divide: a larger boost yields a smaller score,
                                # which sorts earlier in the ascending zrange.
                                score *= 1 / boosts[part]
                        if orig_score != score:
                            pipe.zadd(new_key, raw_id, score)
                    pipe.execute()
        id_list = self.client.zrange(new_key, 0, -1)
        return self._process_ids(id_list, limit, filters, mappers)

    def search_json(self, phrase, limit=None, filters=None, mappers=None, boosts=None, autoboost=False):
        """search(), JSON-decoding each result before filters/mappers run."""
        if not mappers:
            mappers = []
        mappers.insert(0, json.loads)
        return self.search(phrase, limit, filters, mappers, boosts, autoboost)

594
vendor/redis_completion/stop_words.py vendored Executable file
View file

@ -0,0 +1,594 @@
words = """a
a's
able
about
above
according
accordingly
across
actually
after
afterwards
again
against
ain't
all
allow
allows
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anybody
anyhow
anyone
anything
anyway
anyways
anywhere
apart
appear
appreciate
appropriate
are
aren't
around
as
aside
ask
asking
associated
at
available
away
awfully
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
believe
below
beside
besides
best
better
between
beyond
bill
both
bottom
brief
but
by
c'mon
c's
call
came
can
can't
cannot
cant
cause
causes
certain
certainly
changes
clearly
co
com
come
comes
computer
con
concerning
consequently
consider
considering
contain
containing
contains
corresponding
could
couldn't
couldnt
course
cry
currently
de
definitely
describe
described
despite
detail
did
didn't
different
do
does
doesn't
doing
don't
done
down
downwards
due
during
each
edu
eg
eight
either
eleven
else
elsewhere
empty
enough
entirely
especially
et
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
exactly
example
except
far
few
fifteen
fifth
fify
fill
find
fire
first
five
followed
following
follows
for
former
formerly
forth
forty
found
four
from
front
full
further
furthermore
get
gets
getting
give
given
gives
go
goes
going
gone
got
gotten
greetings
had
hadn't
happens
hardly
has
hasn't
hasnt
have
haven't
having
he
he's
hello
help
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
hi
him
himself
his
hither
hopefully
how
howbeit
however
hundred
i
i'd
i'll
i'm
i've
ie
if
ignored
immediate
in
inasmuch
inc
indeed
indicate
indicated
indicates
inner
insofar
instead
interest
into
inward
is
isn't
it
it'd
it'll
it's
its
itself
just
keep
keeps
kept
know
known
knows
last
lately
later
latter
latterly
least
less
lest
let
let's
like
liked
likely
little
look
looking
looks
ltd
made
mainly
many
may
maybe
me
mean
meanwhile
merely
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
nd
near
nearly
necessary
need
needs
neither
never
nevertheless
new
next
nine
no
nobody
non
none
noone
nor
normally
not
nothing
novel
now
nowhere
obviously
of
off
often
oh
ok
okay
old
on
once
one
ones
only
onto
or
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
own
part
particular
particularly
per
perhaps
placed
please
plus
possible
presumably
probably
provides
put
que
quite
qv
rather
rd
re
really
reasonably
regarding
regardless
regards
relatively
respectively
right
said
same
saw
say
saying
says
second
secondly
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sensible
sent
serious
seriously
seven
several
shall
she
should
shouldn't
show
side
since
sincere
six
sixty
so
some
somebody
somehow
someone
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specified
specify
specifying
still
sub
such
sup
sure
system
t's
take
taken
tell
ten
tends
th
than
thank
thanks
thanx
that
that's
thats
the
their
theirs
them
themselves
then
thence
there
there's
thereafter
thereby
therefore
therein
theres
thereupon
these
they
they'd
they'll
they're
they've
thick
thin
think
third
this
thorough
thoroughly
those
though
three
through
throughout
thru
thus
to
together
too
took
top
toward
towards
tried
tries
truly
try
trying
twelve
twenty
twice
two
un
under
unfortunately
unless
unlikely
until
unto
up
upon
us
use
used
useful
uses
using
usually
value
various
very
via
viz
vs
want
wants
was
wasn't
way
we
we'd
we'll
we're
we've
welcome
well
went
were
weren't
what
what's
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who's
whoever
whole
whom
whose
why
will
willing
wish
with
within
without
won't
wonder
would
wouldn't
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
zero"""
# Normalize the raw blob above into a set for O(1) membership tests;
# strip() and the `if w` guard drop any stray whitespace or blank lines.
STOP_WORDS = set([
    w.strip() for w in words.splitlines() if w
])

277
vendor/redis_completion/tests.py vendored Executable file
View file

@ -0,0 +1,277 @@
import random
from unittest import TestCase
from redis_completion.engine import RedisEngine
stop_words = set(['a', 'an', 'the', 'of'])
class RedisCompletionTestCase(TestCase):
    """Exercises RedisEngine against a live Redis server (db 15).

    NOTE(review): requires a running Redis instance; each test flushes the
    'testac'-prefixed keys first. NewsBlur's vendored _process_ids returns
    raw ids instead of stored payloads, so the JSON-payload assertions
    below may no longer pass against this vendored copy -- verify.
    """
    def setUp(self):
        self.engine = self.get_engine()
        self.engine.flush()

    def get_engine(self):
        # db 15 keeps test keys away from any real data
        return RedisEngine(prefix='testac', db=15)

    def store_data(self, id=None):
        # Store all fixtures, or only the one whose id matches `id`.
        test_data = (
            (1, 'testing python'),
            (2, 'testing python code'),
            (3, 'web testing python code'),
            (4, 'unit tests with python'),
        )
        for obj_id, title in test_data:
            if id is None or id == obj_id:
                self.engine.store_json(obj_id, title, {
                    'obj_id': obj_id,
                    'title': title,
                    'secret': obj_id % 2 == 0 and 'derp' or 'herp',
                })

    def sort_results(self, r):
        # Sort by id so assertions don't depend on score ordering.
        return sorted(r, key=lambda i: i['obj_id'])

    def test_search(self):
        # Multi-word, prefix, exact, empty, and no-match queries.
        self.store_data()
        results = self.engine.search_json('testing python')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 2, 'title': 'testing python code', 'secret': 'derp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])
        results = self.engine.search_json('test')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 2, 'title': 'testing python code', 'secret': 'derp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
            {'obj_id': 4, 'title': 'unit tests with python', 'secret': 'derp'},
        ])
        results = self.engine.search_json('unit')
        self.assertEqual(results, [
            {'obj_id': 4, 'title': 'unit tests with python', 'secret': 'derp'},
        ])
        results = self.engine.search_json('')
        self.assertEqual(results, [])
        results = self.engine.search_json('missing')
        self.assertEqual(results, [])

    def test_boosting(self):
        # Explicit per-type boost maps reorder otherwise score-ordered results.
        test_data = (
            (1, 'test alpha', 't1'),
            (2, 'test beta', 't1'),
            (3, 'test gamma', 't1'),
            (4, 'test delta', 't1'),
            (5, 'test alpha', 't2'),
            (6, 'test beta', 't2'),
            (7, 'test gamma', 't2'),
            (8, 'test delta', 't2'),
            (9, 'test alpha', 't3'),
            (10, 'test beta', 't3'),
            (11, 'test gamma', 't3'),
            (12, 'test delta', 't3'),
        )
        for obj_id, title, obj_type in test_data:
            self.engine.store_json(obj_id, title, {
                'obj_id': obj_id,
                'title': title,
            }, obj_type)

        def assertExpected(results, id_list):
            self.assertEqual([r['obj_id'] for r in results], id_list)

        results = self.engine.search_json('alp')
        assertExpected(results, [1, 5, 9])
        results = self.engine.search_json('alp', boosts={'t2': 1.1})
        assertExpected(results, [5, 1, 9])
        results = self.engine.search_json('test', boosts={'t3': 1.5, 't2': 1.1})
        assertExpected(results, [9, 10, 12, 11, 5, 6, 8, 7, 1, 2, 4, 3])
        results = self.engine.search_json('alp', boosts={'t1': 0.5})
        assertExpected(results, [5, 9, 1])
        results = self.engine.search_json('alp', boosts={'t1': 1.5, 't3': 1.6})
        assertExpected(results, [9, 1, 5])
        results = self.engine.search_json('alp', boosts={'t3': 1.5, '5': 1.6})
        assertExpected(results, [5, 9, 1])

    def test_autoboost(self):
        # Persistent boosts stored via boost() are merged when autoboost=True.
        self.engine.store('t1', 'testing 1')
        self.engine.store('t2', 'testing 2')
        self.engine.store('t3', 'testing 3')
        self.engine.store('t4', 'testing 4')
        self.engine.store('t5', 'testing 5')

        def assertExpected(results, id_list):
            self.assertEqual(results, ['testing %s' % i for i in id_list])

        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [1, 2, 3, 4, 5])
        self.engine.boost('t3')
        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [3, 1, 2, 4, 5])
        self.engine.boost('t2')
        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [2, 3, 1, 4, 5])
        self.engine.boost('t1', negative=True)
        results = self.engine.search('testing', autoboost=True)
        assertExpected(results, [2, 3, 4, 5, 1])
        # Explicit boosts take precedence over stored ones.
        results = self.engine.search('testing', boosts={'t5': 4.0}, autoboost=True)
        assertExpected(results, [5, 2, 3, 4, 1])
        results = self.engine.search('testing', boosts={'t3': 1.5}, autoboost=True)
        assertExpected(results, [3, 2, 4, 5, 1])

    def test_limit(self):
        # `limit` truncates results after filtering.
        self.store_data()
        results = self.engine.search_json('testing', limit=1)
        self.assertEqual(results, [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
        ])

    def test_filters(self):
        # Filters run on the decoded payload and exclude non-matching rows.
        self.store_data()
        f = lambda i: i['secret'] == 'herp'
        results = self.engine.search_json('testing python', filters=[f])
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])

    def test_simple(self):
        # Storing bare titles (id defaults to the title itself).
        # NOTE(review): print_scores is never read by the engine -- likely vestigial.
        self.engine.print_scores = True
        self.engine.store('testing python')
        self.engine.store('testing python code')
        self.engine.store('web testing python code')
        self.engine.store('unit tests with python')
        results = self.engine.search('testing')
        self.assertEqual(results, ['testing python', 'testing python code', 'web testing python code'])
        results = self.engine.search('code')
        self.assertEqual(results, ['testing python code', 'web testing python code'])

    def test_correct_sorting(self):
        # Results must come back lexicographically regardless of insert order.
        strings = []
        for i in range(26):
            strings.append('aaaa%s' % chr(i + ord('a')))
            if i > 0:
                strings.append('aaa%sa' % chr(i + ord('a')))
        random.shuffle(strings)
        for s in strings:
            self.engine.store(s)
        results = self.engine.search('aaa')
        self.assertEqual(results, sorted(strings))
        results = self.engine.search('aaa', limit=30)
        self.assertEqual(results, sorted(strings)[:30])

    def test_removing_objects(self):
        # remove() drops an object from search; re-storing restores it.
        self.store_data()
        self.engine.remove(1)
        results = self.engine.search_json('testing')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 2, 'title': 'testing python code', 'secret': 'derp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])
        self.store_data(1)
        self.engine.remove(2)
        results = self.engine.search_json('testing')
        self.assertEqual(self.sort_results(results), [
            {'obj_id': 1, 'title': 'testing python', 'secret': 'herp'},
            {'obj_id': 3, 'title': 'web testing python code', 'secret': 'herp'},
        ])

    def test_clean_phrase(self):
        # Tokenizing lowercases, strips punctuation, and drops stop words.
        self.assertEqual(self.engine.clean_phrase('abc def ghi'), ['abc', 'def', 'ghi'])
        self.assertEqual(self.engine.clean_phrase('a A tHe an a'), [])
        self.assertEqual(self.engine.clean_phrase(''), [])
        self.assertEqual(
            self.engine.clean_phrase('The Best of times, the blurst of times'),
            ['best', 'times', 'blurst', 'times'])

    def test_exists(self):
        self.assertFalse(self.engine.exists('test'))
        self.engine.store('test')
        self.assertTrue(self.engine.exists('test'))

    def test_removing_objects_in_depth(self):
        # want to ensure that redis is cleaned up and does not become polluted
        # with spurious keys when objects are removed
        redis_client = self.engine.client
        prefix = self.engine.prefix
        initial_key_count = len(redis_client.keys())
        # store the blog "testing python"
        self.store_data(1)
        # see how many keys we have in the db - check again in a bit
        key_len = len(redis_client.keys())
        self.store_data(2)
        key_len2 = len(redis_client.keys())
        self.assertTrue(key_len != key_len2)
        self.engine.remove(2)
        # back to the original amount of keys
        self.assertEqual(len(redis_client.keys()), key_len)
        self.engine.remove(1)
        self.assertEqual(len(redis_client.keys()), initial_key_count)

    def test_updating(self):
        # Re-storing under the same id replaces payload and/or index entries.
        self.engine.store('id1', 'title one', 'd1', 't1')
        self.engine.store('id2', 'title two', 'd2', 't2')
        self.engine.store('id3', 'title three', 'd3', 't3')
        results = self.engine.search('tit')
        self.assertEqual(results, ['d1', 'd3', 'd2'])
        # overwrite the data for id1
        self.engine.store('id1', 'title one', 'D1', 't1')
        results = self.engine.search('tit')
        self.assertEqual(results, ['D1', 'd3', 'd2'])
        # overwrite the data with a new title, will remove the title one refs
        self.engine.store('id1', 'Herple One', 'done', 't1')
        results = self.engine.search('tit')
        self.assertEqual(results, ['d3', 'd2'])
        results = self.engine.search('her')
        self.assertEqual(results, ['done'])
        self.engine.store('id1', 'title one', 'Done', 't1', False)
        results = self.engine.search('tit')
        self.assertEqual(results, ['Done', 'd3', 'd2'])
        # this shows that when we don't clean up crap gets left around
        results = self.engine.search('her')
        self.assertEqual(results, ['Done'])