mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-08-05 16:58:59 +00:00
208 lines
6.8 KiB
Python
208 lines
6.8 KiB
Python
import pyes
|
|
from pyes.query import FilteredQuery, FuzzyQuery, TextQuery, PrefixQuery
|
|
from pyes.filters import RangeFilter
|
|
from pyes.utils import ESRange
|
|
from django.conf import settings
|
|
from django.contrib.auth.models import User
|
|
from utils import log as logging
|
|
|
|
class SearchStarredStory:
    """Elasticsearch index and search helpers for starred (saved) stories.

    Every method is a classmethod that works through the shared class-level
    ``ES`` client. The index and document type names are both derived from
    ``name`` ("starred-stories-index" / "starred-stories-type").
    """

    # The pyes client is constructed when the class body is evaluated.
    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
    # Base name used to build the index and document type names.
    name = "starred-stories"

    @classmethod
    def _index_name(cls):
        """Return the name of the index that holds this class's documents."""
        return "%s-index" % cls.name

    @classmethod
    def _type_name(cls):
        """Return the document type name used inside the index."""
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls):
        """Create the index and register the field mapping for saved stories."""
        cls.ES.create_index(cls._index_name())
        mapping = {
            'title': {
                'boost': 2.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
                "term_vector": "with_positions_offsets",
            },
            'content': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
                "term_vector": "with_positions_offsets",
            },
            'author': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
            },
            'db_id': {
                'index': 'not_analyzed',
                'store': 'yes',
                'type': 'string',
            },
            'feed_id': {
                'store': 'yes',
                'type': 'integer',
            },
            'date': {
                'store': 'yes',
                'type': 'date',
            },
            'user_ids': {
                'index': 'not_analyzed',
                'store': 'yes',
                'type': 'integer',
                'index_name': 'user_id',
            },
        }
        cls.ES.put_mapping(cls._type_name(), {'properties': mapping}, [cls._index_name()])

    @classmethod
    def index(cls, user_id, story_id, story_title, story_content, story_author, story_date, db_id):
        """Index one saved story, using ``story_id`` as the document id.

        NOTE(review): the mapping declares a ``feed_id`` field, but this
        method receives no feed id and never writes one -- confirm whether
        that is intentional.
        """
        doc = {
            "content": story_content,
            "title": story_title,
            "author": story_author,
            "date": story_date,
            "user_ids": user_id,
            "db_id": db_id,
        }
        cls.ES.index(doc, cls._index_name(), cls._type_name(), story_id)

    @classmethod
    def query(cls, user_id, text):
        """Search saved stories for ``text`` and return the pyes result object.

        A query-string search runs first. While the current result reports
        no hits, fuzzy searches on ``title``, then ``content``, then
        ``author`` are tried in that order; the last result obtained is
        returned.

        Every search is restricted to this class's own index and document
        type. Without that restriction the search is not limited to any
        index and could return documents that are not saved stories.

        NOTE(review): ``user_id`` is only used to look up the user for
        logging; results are not filtered by user here -- confirm whether
        callers filter afterwards.
        """
        user = User.objects.get(pk=user_id)
        cls.ES.refresh()

        # Keyword arguments shared by every search issued below.
        scope = {
            'indices': [cls._index_name()],
            'doc_types': [cls._type_name()],
        }

        results = cls.ES.search(pyes.query.StringQuery(text), **scope)
        logging.user(user, "~FGSearch ~FCsaved stories~FG for: ~SB%s" % text)

        # Fall back field by field, stopping as soon as something matches.
        for field in ('title', 'content', 'author'):
            if results.total:
                break
            logging.user(user, "~FGSearch ~FCsaved stories~FG by %s: ~SB%s" % (field, text))
            results = cls.ES.search(FuzzyQuery(field, text), **scope)

        return results
|
|
|
|
|
|
class SearchFeed:
    """Elasticsearch index and search helpers for feeds.

    Every method is a classmethod that works through the shared class-level
    ``ES`` client. The index and document type names are both derived from
    ``name`` ("feeds-index" / "feeds-type").
    """

    # The pyes client is constructed when the class body is evaluated.
    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
    # Base name used to build the index and document type names.
    name = "feeds"

    @classmethod
    def _index_name(cls):
        """Return the name of the index that holds this class's documents."""
        return "%s-index" % cls.name

    @classmethod
    def _type_name(cls):
        """Return the document type name used inside the index."""
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls):
        """Drop and recreate the feeds index, then register its field mapping.

        The existing index is deleted first; a ``pyes.TypeMissingException``
        raised by that delete is reported on stdout and otherwise ignored.
        The index is then created with a custom ``url_analyzer`` that the
        ``address`` and ``link`` fields use.
        """
        try:
            cls.ES.delete_index(cls._index_name())
        except pyes.TypeMissingException:
            print("Index missing, can't delete: %s-index" % cls.name)

        # Named ``index_settings`` so it does not shadow the Django
        # ``settings`` module imported at the top of the file.
        index_settings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "url_analyzer": {
                            "type": "custom",
                            "tokenizer": "urls",
                            "filter": ["stop", "url_stop"],
                        },
                    },
                    "tokenizer": {
                        "urls": {
                            "type": "uax_url_email",
                            "max_token_length": 255,
                        },
                    },
                    "filter": {
                        "url_stop": {
                            "type": "stop",
                            "stopwords": ["http", "https"],
                        },
                        # NOTE(review): defined but not referenced by any
                        # analyzer in these settings.
                        "url_ngram": {
                            "type": "nGram",
                            "min_gram": 2,
                            "max_gram": 20,
                        },
                    },
                },
            },
        }
        cls.ES.create_index(cls._index_name(), index_settings)

        mapping = {
            'address': {
                'boost': 3.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
                "term_vector": "with_positions_offsets",
                "analyzer": "url_analyzer",
            },
            'title': {
                'boost': 2.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
                "term_vector": "with_positions_offsets",
            },
            'link': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': 'string',
                "term_vector": "with_positions_offsets",
                "analyzer": "url_analyzer",
            },
            'num_subscribers': {
                'boost': 1.0,
                'index': 'not_analyzed',
                'store': 'yes',
                'type': 'integer',
            },
            'feed_id': {
                'store': 'yes',
                'type': 'integer',
            },
        }
        cls.ES.put_mapping(cls._type_name(), {'properties': mapping}, [cls._index_name()])

    @classmethod
    def index(cls, feed_id, title, address, link, num_subscribers):
        """Index one feed, using ``feed_id`` as the document id."""
        doc = {
            "feed_id": feed_id,
            "title": title,
            "address": address,
            "link": link,
            "num_subscribers": num_subscribers,
        }
        cls.ES.index(doc, cls._index_name(), cls._type_name(), feed_id)

    @classmethod
    def query(cls, text):
        """Search feeds for ``text`` and return the pyes result object.

        The search tries ``address`` (text query), then ``title`` (prefix
        query), then ``link.partial`` (text query), moving to the next
        attempt only while the current result reports no hits. Each search
        is wrapped in a range filter on ``num_subscribers``, sorted by
        ``num_subscribers`` descending, and limited to 5 results.

        Every search is restricted to this class's own index and document
        type, so it is no longer left unrestricted across indices.
        """
        cls.ES.refresh()

        # presumably 2 is the lower bound on num_subscribers -- confirm
        # against the ESRange signature.
        sub_filter = RangeFilter(qrange=ESRange('num_subscribers', 2))

        # Keyword arguments shared by every search issued below.
        search_kwargs = {
            'indices': [cls._index_name()],
            'doc_types': [cls._type_name()],
            'sort': "num_subscribers:desc",
            'size': 5,
        }

        logging.info("~FGSearch ~FCfeeds~FG by address: ~SB%s" % text)
        results = cls.ES.search(FilteredQuery(TextQuery('address', text), sub_filter), **search_kwargs)

        # NOTE(review): the mapping in create_elasticsearch_mapping declares
        # no ``partial`` sub-field on ``link``; confirm this last fallback
        # can actually match.
        fallbacks = (
            ('title', PrefixQuery, 'title'),
            ('link', TextQuery, 'link.partial'),
        )
        for label, query_class, field in fallbacks:
            if results.total:
                break
            logging.info("~FGSearch ~FCfeeds~FG by %s: ~SB%s" % (label, text))
            results = cls.ES.search(FilteredQuery(query_class(field, text), sub_filter), **search_kwargs)

        return results
|