2012-03-11 16:13:05 -07:00
|
|
|
import pyes
|
2014-04-11 15:40:58 -07:00
|
|
|
from pyes.query import FuzzyQuery, MatchQuery, PrefixQuery
|
2012-03-11 16:13:05 -07:00
|
|
|
from django.conf import settings
|
2012-12-21 15:05:38 -08:00
|
|
|
from django.contrib.auth.models import User
|
|
|
|
from utils import log as logging
|
2012-03-11 16:13:05 -07:00
|
|
|
|
2014-04-11 15:40:58 -07:00
|
|
|
class SearchStory:
    """Classmethod-only wrapper around the Elasticsearch story index.

    The class is never instantiated by the code in this file; the connection
    and the base name live on the class and every operation is a classmethod.
    """

    # Connection object shared by all classmethods; built when the class
    # body executes.
    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
    # Base name from which the index and document-type names are derived.
    name = "stories"

    @classmethod
    def index_name(cls):
        """Return the index name derived from ``cls.name``."""
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        """Return the document-type name derived from ``cls.name``."""
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls):
        """Create the story index and register its field mapping.

        Uses the ``ES.indices`` manager, the same entry point ``SearchFeed``
        uses for index administration in this file.
        """
        cls.ES.indices.create_index(cls.index_name())
        mapping = {
            'title': {
                'boost': 2.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
                "term_vector": "with_positions_offsets",
            },
            'content': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
                "term_vector": "with_positions_offsets",
            },
            'author': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
            },
            'db_id': {
                'index': 'not_analyzed',
                'store': 'yes',
                'type': 'string',
            },
            # NOTE(review): feed_id is mapped here but index() below never
            # writes it -- confirm whether callers are expected to supply it.
            'feed_id': {
                'store': 'yes',
                'type': 'integer',
            },
            'date': {
                'store': 'yes',
                'type': 'date',
            },
            'user_ids': {
                'index': 'not_analyzed',
                'store': 'yes',
                'type': 'integer',
                'index_name': 'user_id',
            },
        }
        cls.ES.indices.put_mapping(cls.type_name(), {'properties': mapping},
                                   [cls.index_name()])

    @classmethod
    def index(cls, user_id, story_id, story_title, story_content, story_author, story_date, db_id):
        """Index a single story document under the id ``story_id``.

        The document carries the story's content, title, author and date,
        the given ``user_id`` (stored under the ``user_ids`` key) and
        ``db_id``.
        """
        doc = {
            "content": story_content,
            "title": story_title,
            "author": story_author,
            "date": story_date,
            "user_ids": user_id,
            "db_id": db_id,
        }
        cls.ES.index(doc, cls.index_name(), cls.type_name(), story_id)

    @classmethod
    def query(cls, user_id, text):
        """Search stories for ``text`` and return the pyes result set.

        A query-string search is tried first. If it reports no hits
        (``results.total`` is falsy), fuzzy searches on ``title``,
        ``content`` and ``author`` are tried in that order, stopping at the
        first one that matches. The result of the last search that ran is
        returned, so it may be empty.

        ``user_id`` is only used to load the ``User`` for log lines; it does
        not filter the search.
        """
        user = User.objects.get(pk=user_id)
        # The helpers are classmethods and must be *called*: passing the
        # bound methods themselves to ES.search() hands pyes a function
        # object instead of an index/type name.
        index_name = cls.index_name()
        type_name = cls.type_name()

        cls.ES.indices.refresh()

        q = pyes.query.StringQuery(text)
        # Debug output; single-argument form prints the same text whether
        # ``print`` is a statement or a function.
        print("%s %s %s" % (q.serialize(), index_name, type_name))
        results = cls.ES.search(q, indices=index_name, doc_types=[type_name])
        logging.user(user, "~FGSearch ~FCstories~FG for: ~SB%s" % text)

        # Fallbacks are scoped to the story index/type as well, so they are
        # not run against whatever other indices the connection defaults to.
        for field in ('title', 'content', 'author'):
            if results.total:
                break
            logging.user(user, "~FGSearch ~FCstories~FG by %s: ~SB%s" % (field, text))
            q = FuzzyQuery(field, text)
            results = cls.ES.search(q, indices=index_name, doc_types=[type_name])

        return results
|
2013-01-04 16:34:27 -08:00
|
|
|
|
|
|
|
|
|
|
|
class SearchFeed:
    """Classmethod-only wrapper around the Elasticsearch feed index.

    The class is never instantiated by the code in this file; the connection
    and the base name live on the class and every operation is a classmethod.
    """

    # Connection object shared by all classmethods; built when the class
    # body executes.
    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
    # Base name from which the index and document-type names are derived.
    name = "feeds"

    @classmethod
    def index_name(cls):
        """Return the index name derived from ``cls.name``."""
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        """Return the document-type name derived from ``cls.name``."""
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls):
        """Drop the feed index if it exists, then recreate it with its mapping.

        The index is created with custom analysis settings (edge-n-gram and
        n-gram analyzers), after which the field mapping for the feed
        document type is registered.
        """
        cls.ES.indices.delete_index_if_exists(cls.index_name())

        # Named ``index_settings`` so the module-level Django ``settings``
        # import is not shadowed inside this method.
        index_settings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "edgengram_analyzer": {
                            "filter": ["edgengram"],
                            "tokenizer": "lowercase",
                            "type": "custom",
                        },
                        "ngram_analyzer": {
                            "filter": ["ngram"],
                            "tokenizer": "lowercase",
                            "type": "custom",
                        },
                    },
                    "filter": {
                        "edgengram": {
                            "max_gram": "15",
                            "min_gram": "2",
                            "type": "edgeNGram",
                        },
                        "ngram": {
                            "max_gram": "15",
                            "min_gram": "3",
                            "type": "nGram",
                        },
                    },
                    # NOTE(review): neither analyzer above references these
                    # tokenizers (both name "lowercase"); kept as-is.
                    "tokenizer": {
                        "edgengram_tokenizer": {
                            "max_gram": "15",
                            "min_gram": "2",
                            "side": "front",
                            "type": "edgeNGram",
                        },
                        "ngram_tokenizer": {
                            "max_gram": "15",
                            "min_gram": "3",
                            "type": "nGram",
                        },
                    },
                },
            },
        }
        cls.ES.indices.create_index(cls.index_name(), index_settings)

        mapping = {
            "address": {
                "analyzer": "edgengram_analyzer",
                "store": True,
                "term_vector": "with_positions_offsets",
                "type": "string",
            },
            "feed_id": {
                "store": True,
                "type": "string",
            },
            "num_subscribers": {
                "index": "analyzed",
                "store": True,
                "type": "long",
            },
            "title": {
                "analyzer": "edgengram_analyzer",
                "store": True,
                "term_vector": "with_positions_offsets",
                "type": "string",
            },
        }
        cls.ES.indices.put_mapping(cls.type_name(), {'properties': mapping},
                                   [cls.index_name()])

    @classmethod
    def index(cls, feed_id, title, address, link, num_subscribers):
        """Index a single feed document under the id ``feed_id``."""
        doc = {
            "feed_id": feed_id,
            "title": title,
            "address": address,
            # "link" has no entry in the mapping registered above.
            "link": link,
            "num_subscribers": num_subscribers,
        }
        cls.ES.index(doc, cls.index_name(), cls.type_name(), feed_id)

    @classmethod
    def query(cls, text):
        """Search feeds for ``text`` and return the pyes result set.

        Match queries are tried against ``address``, then ``title`` (with
        ``operator="and"``), then ``link``, stopping at the first attempt
        whose ``results.total`` is truthy. Each search is sorted by
        ``num_subscribers`` descending and limited to 5 hits. The result of
        the last attempt that ran is returned, so it may be empty.
        """
        index_name = cls.index_name()
        type_name = cls.type_name()

        # Searches below pass no explicit index; they rely on the
        # connection's default being pointed at the feed index here.
        cls.ES.default_indices = index_name
        cls.ES.indices.refresh()

        # (field, extra MatchQuery keyword arguments), in fallback order.
        attempts = (
            ('address', {}),
            ('title', {'operator': "and"}),
            ('link', {}),
        )

        results = None
        for field, match_options in attempts:
            logging.info("~FGSearch ~FCfeeds~FG by %s: ~SB%s" % (field, text))
            q = MatchQuery(field, text, **match_options)
            # Debug output; single-argument form prints the same text whether
            # ``print`` is a statement or a function.
            print("%s %s %s" % (q.serialize(), index_name, type_name))
            results = cls.ES.search(query=q, sort="num_subscribers:desc", size=5,
                                    doc_types=[type_name])
            if results.total:
                break

        return results
|