2012-03-11 16:13:05 -07:00
|
|
|
import pyes
|
2014-04-15 14:17:15 -07:00
|
|
|
from pyes.query import MatchQuery
|
2012-03-11 16:13:05 -07:00
|
|
|
from django.conf import settings
|
2012-12-21 15:05:38 -08:00
|
|
|
from utils import log as logging
|
2012-03-11 16:13:05 -07:00
|
|
|
|
2014-04-11 15:40:58 -07:00
|
|
|
class SearchStory:
    """Class-level helpers for indexing and searching stories via pyes.

    Nothing here is instantiated: all operations are classmethods that
    share the single `ES` connection created when the class is defined.
    """

    # Connection is opened at class-definition (import) time.
    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
    # Base name from which the index and doc-type names are derived.
    name = "stories"

    @classmethod
    def index_name(cls):
        """Return the index name derived from `cls.name`."""
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        """Return the document type name derived from `cls.name`."""
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls):
        """Drop the story index if it exists, recreate it, and install the mapping.

        Destructive: the existing index is deleted before being rebuilt.
        """
        def analyzed_string(boost, analyzer):
            # Shared shape of the three analyzed text fields.
            return {
                'boost': boost,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': analyzer,
            }

        index_name = cls.index_name()
        indices = cls.ES.indices
        indices.delete_index_if_exists(index_name)
        indices.create_index(index_name)

        properties = {
            # Title carries twice the boost of the other text fields.
            'title': analyzed_string(2.0, 'snowball'),
            'content': analyzed_string(1.0, 'snowball'),
            # Author uses the 'keyword' analyzer rather than 'snowball'.
            'author': analyzed_string(1.0, 'keyword'),
            'feed_id': {
                'store': 'no',
                'type': 'integer'
            },
            'date': {
                'store': 'no',
                'type': 'date',
            }
        }
        indices.put_mapping(cls.type_name(), {'properties': properties}, [index_name])

    @classmethod
    def index(cls, story_hash, story_title, story_content, story_author, story_feed_id,
              story_date):
        """Index one story document, using `story_hash` as the document id."""
        document = dict(
            content=story_content,
            title=story_title,
            author=story_author,
            feed_id=story_feed_id,
            date=story_date,
        )
        cls.ES.index(document, cls.index_name(), cls.type_name(), story_hash)

    @classmethod
    def remove(cls, story_hash):
        """Delete the story document whose id is `story_hash`."""
        cls.ES.delete(cls.index_name(), cls.type_name(), story_hash)

    @classmethod
    def query(cls, feed_ids, query):
        """Search stories for `query`, restricted to the given `feed_ids`.

        Both clauses are required (`must`): the query string, with AND as
        its default operator, and membership of `feed_id` in `feed_ids`.

        Returns a list of the ids of the matching results.
        """
        # Refresh is issued without arguments before every search.
        cls.ES.indices.refresh()

        combined_q = pyes.query.BoolQuery(must=[
            pyes.query.StringQuery(query, default_operator="AND"),
            pyes.query.TermsQuery('feed_id', feed_ids),
        ])
        hits = cls.ES.search(combined_q, indices=cls.index_name(),
                             doc_types=[cls.type_name()])

        feed_count = len(feed_ids)
        plural = 's' if feed_count != 1 else ''
        logging.info("~FGSearch ~FCstories~FG for: ~SB%s (across %s feed%s)" %
                     (query, feed_count, plural))

        story_ids = []
        for hit in hits:
            story_ids.append(hit.get_id())
        return story_ids
|
2013-01-04 16:34:27 -08:00
|
|
|
|
|
|
|
|
|
|
|
class SearchFeed:
    """Class-level helpers for indexing and searching feeds via pyes.

    Nothing here is instantiated: all operations are classmethods that
    share the single `ES` connection created when the class is defined.
    """

    # Connection is opened at class-definition (import) time.
    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
    # Base name from which the index and doc-type names are derived.
    name = "feeds"

    @classmethod
    def index_name(cls):
        """Return the index name derived from `cls.name`."""
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        """Return the document type name derived from `cls.name`."""
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls):
        """Drop the feed index if it exists, recreate it with custom analysis
        settings, and install the field mapping.

        Destructive: the existing index is deleted before being rebuilt.
        """
        index_name = cls.index_name()
        indices = cls.ES.indices
        indices.delete_index_if_exists(index_name)

        analyzers = {
            "edgengram_analyzer": {
                "filter": ["edgengram"],
                "tokenizer": "lowercase",
                "type": "custom"
            },
            # NOTE(review): no field in the mapping below references
            # ngram_analyzer.
            "ngram_analyzer": {
                "filter": ["ngram"],
                "tokenizer": "lowercase",
                "type": "custom"
            }
        }
        token_filters = {
            "edgengram": {
                "max_gram": "15",
                "min_gram": "2",
                "type": "edgeNGram"
            },
            "ngram": {
                "max_gram": "15",
                "min_gram": "3",
                "type": "nGram"
            }
        }
        # NOTE(review): both analyzers above use the "lowercase" tokenizer,
        # so these custom tokenizers are declared but not referenced here.
        tokenizers = {
            "edgengram_tokenizer": {
                "max_gram": "15",
                "min_gram": "2",
                "side": "front",
                "type": "edgeNGram"
            },
            "ngram_tokenizer": {
                "max_gram": "15",
                "min_gram": "3",
                "type": "nGram"
            }
        }
        index_settings = {
            "index": {
                "analysis": {
                    "analyzer": analyzers,
                    "filter": token_filters,
                    "tokenizer": tokenizers
                }
            }
        }
        indices.create_index(index_name, index_settings)

        def edgengram_string():
            # Shared shape of the two edge-ngram analyzed text fields.
            return {
                "analyzer": "edgengram_analyzer",
                "store": True,
                "term_vector": "with_positions_offsets",
                "type": "string"
            }

        # "link" is written by index() and searched by query() but has no
        # explicit entry in this mapping.
        properties = {
            "address": edgengram_string(),
            "feed_id": {
                "store": True,
                "type": "string"
            },
            "num_subscribers": {
                "index": "analyzed",
                "store": True,
                "type": "long"
            },
            "title": edgengram_string()
        }
        indices.put_mapping(cls.type_name(), {'properties': properties}, [index_name])

    @classmethod
    def index(cls, feed_id, title, address, link, num_subscribers):
        """Index one feed document, using `feed_id` as the document id."""
        document = dict(
            feed_id=feed_id,
            title=title,
            address=address,
            link=link,
            num_subscribers=num_subscribers,
        )
        cls.ES.index(document, cls.index_name(), cls.type_name(), feed_id)

    @classmethod
    def query(cls, text):
        """Search feeds for `text`, trying address, then title, then link.

        Each attempt asks for at most 5 hits sorted by `num_subscribers`
        descending. A later field is only tried when the previous attempt
        reported a falsy `total`.

        Returns the result object of the last search performed.
        """
        cls.ES.default_indices = cls.index_name()
        cls.ES.indices.refresh()

        # (log template, field to match, extra MatchQuery keyword arguments)
        attempts = (
            ("~FGSearch ~FCfeeds~FG by address: ~SB%s", 'address', {'type': "phrase"}),
            ("~FGSearch ~FCfeeds~FG by title: ~SB%s", 'title', {}),
            ("~FGSearch ~FCfeeds~FG by link: ~SB%s", 'link', {}),
        )

        results = None
        for log_template, field, extra in attempts:
            # Stop as soon as the previous attempt found something; the
            # final attempt's result is returned without inspecting it.
            if results is not None and results.total:
                break
            logging.info(log_template % text)
            match_q = MatchQuery(field, text, operator="and", **extra)
            results = cls.ES.search(query=match_q, sort="num_subscribers:desc", size=5,
                                    doc_types=[cls.type_name()])

        return results
|