diff --git a/ansible/roles/elasticsearch/tasks/main.yml b/ansible/roles/elasticsearch/tasks/main.yml
index c7d207402..cd0e06932 100644
--- a/ansible/roles/elasticsearch/tasks/main.yml
+++ b/ansible/roles/elasticsearch/tasks/main.yml
@@ -35,16 +35,16 @@
     name: vm.max_map_count
     value: "262144"
     state: present
-
-- name: Start Elasticsearch Docker container
+
+- name: Start Elasticsearch v7 Docker container
   docker_container:
     name: elasticsearch
     image: elasticsearch:7.14.0
     state: started
     hostname: "{{ inventory_hostname }}"
     ports:
-      - '9200:9200'
-      - '9300:9300'
+      - "9200:9200"
+      - "9300:9300"
     restart_policy: unless-stopped
     container_default_behavior: no_defaults
    networks_cli_compatible: yes
@@ -52,7 +52,7 @@
     network_mode: default
     networks:
       - name: newsblurnet
-        aliases: 
+        aliases:
          - elasticsearch
     user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
     volumes:
@@ -64,8 +64,49 @@
   tags: consul
   become: yes
   template:
-    src: consul_service.json
+    src: consul_service.json.j2
     dest: /etc/consul.d/elasticsearch.json
+  vars:
+    consul_service_name: elasticsearch
+    consul_service_port: 9200
+    elasticsearch_version: 7
+  notify:
+    - reload consul
+
+- name: Start Elasticsearch v8 Docker container
+  docker_container:
+    name: elasticsearch
+    image: elasticsearch:8.17.0
+    state: started
+    hostname: "{{ inventory_hostname }}"
+    ports:
+      - "9208:9200"
+      - "9308:9300"
+    restart_policy: unless-stopped
+    container_default_behavior: no_defaults
+    networks_cli_compatible: yes
+    # network_mode: host
+    network_mode: default
+    networks:
+      - name: newsblurnet
+        aliases:
+          - elasticsearch
+    user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
+    volumes:
+      - /srv/newsblur/docker/volumes/elasticsearch8:/usr/share/elasticsearch/data
+      - /var/log/elasticsearch8/:/var/log/elasticsearch/
+      - /srv/newsblur/config/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
+
+- name: Register elasticsearch in consul
+  tags: consul
+  become: yes
+  template:
+    src: consul_service.json.j2
+    dest: /etc/consul.d/elasticsearch.json
+  vars:
+    consul_service_name: elasticsearch
+    consul_service_port: 9208
+    elasticsearch_version: 8
   notify:
     - reload consul
diff --git a/ansible/roles/elasticsearch/templates/consul_service.json b/ansible/roles/elasticsearch/templates/consul_service.json
index 782a6f485..28c5da280 100644
--- a/ansible/roles/elasticsearch/templates/consul_service.json
+++ b/ansible/roles/elasticsearch/templates/consul_service.json
@@ -1,14 +1,18 @@
 {
     "service": {
-        {% if not elasticsearch_secondary %}
-        "name": "db-elasticsearch",
-        {% else %}
-        "name": "db-elasticsearch-staging",
+        {% if elasticsearch_version == 7 %}
+        {% if not elasticsearch_secondary %}
+        "name": "db-elasticsearch",
+        {% else %}
+        "name": "db-elasticsearch-staging",
+        {% endif %}
+        {% elif elasticsearch_version == 8 %}
+        "name": "db-elasticsearch-v8",
         {% endif %}
         "tags": [
             "db"
         ],
-        "port": 9200,
+        "port": {{ consul_service_port }},
         "checks": [{
             "id": "es-ping",
             "http": "http://{{ ansible_host }}:5579/db_check/elasticsearch?consul=1",
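The tasks above run Elasticsearch 7.14 and 8.17 side by side on the same host, with the v8 container remapped to host ports 9208/9308 and registered in Consul under its own service name. A minimal smoke test for that layout, assuming the Consul DNS names that appear in `settings.py` later in this diff (any address that resolves in your environment works):

```python
# Confirms both containers answer on their own ports with the expected versions.
# The hostnames are assumptions taken from settings.py below; substitute your own.
import requests

endpoints = {
    "v7 (search/story)": "http://db-elasticsearch.service.nyc1.consul:9200",
    "v8 (discover)": "http://db-elasticsearch-v8.service.nyc1.consul:9208",
}

for label, url in endpoints.items():
    info = requests.get(url, timeout=5).json()
    # Expect 7.14.0 and 8.17.0, matching the images pinned in the tasks above
    print(f"{label}: Elasticsearch {info['version']['number']}")
```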
diff --git a/apps/rss_feeds/tasks.py b/apps/rss_feeds/tasks.py
index 3ba85e27b..85cbfae6e 100644
--- a/apps/rss_feeds/tasks.py
+++ b/apps/rss_feeds/tasks.py
@@ -8,7 +8,6 @@
 from celery.exceptions import SoftTimeLimitExceeded
 from django.conf import settings
 from apps.profile.middleware import DBProfilerMiddleware
-from apps.search.models import DiscoverStory
 from newsblur_web.celeryapp import app
 from utils import log as logging
 from utils.redis_raw_log_middleware import RedisDumpMiddleware
diff --git a/apps/search/models.py b/apps/search/models.py
index 093216bd1..1008312b3 100644
--- a/apps/search/models.py
+++ b/apps/search/models.py
@@ -15,6 +15,7 @@
 from django.conf import settings
 from django.contrib.auth.models import User
 from openai import APITimeoutError, OpenAI
+from apps.search.projection_matrix import project_vector
 from apps.search.tasks import (
     FinishIndexSubscriptionsForDiscover,
     FinishIndexSubscriptionsForSearch,
@@ -276,7 +277,7 @@
         feed.index_stories_for_search()
 
     @classmethod
-    def remove_all(cls, drop_index=False):
+    def remove_all(cls, drop_index=False, search=True, discover=True):
         # You only need to drop the index if there is data you want to clear.
         # A new search server won't need this, as there isn't anything to drop.
         if drop_index:
@@ -287,11 +288,11 @@
             logging.info(" ---> ~SN~FRRemoving ~SB%s~SN user searches..." % user_searches.count())
             for user_search in user_searches:
                 try:
-                    user_search.remove()
+                    user_search.remove(search=search, discover=discover)
                 except Exception as e:
                     print(" ****> Error on search removal: %s" % e)
 
-    def remove(self):
+    def remove(self, search=True, discover=True):
         from apps.reader.models import UserSubscription
         from apps.rss_feeds.models import Feed
 
@@ -305,16 +306,24 @@
                 feed = sub.feed
             except Feed.DoesNotExist:
                 continue
-            if not feed.search_indexed:
+            if search and not discover and not feed.search_indexed:
                 continue
-            feed.search_indexed = False
+            if discover and not search and not feed.discover_indexed:
+                continue
+            if search and discover and not feed.search_indexed and not feed.discover_indexed:
+                continue
+            if search:
+                feed.search_indexed = False
+                feed.search_indexing = False
+            if discover:
+                feed.discover_indexed = False
+                feed.discover_indexing = False
             feed.save()
             removed += 1
 
         logging.user(
             user,
-            "~FCRemoved ~SB%s/%s feed's search indexes~SN for ~SB~FB%s~FC~SN."
-            % (removed, total, user.username),
+            f"~FCRemoved ~SB{removed}/{total} feed's {'search' if search and not discover else 'discover' if discover and not search else 'search+discover' if search and discover else 'neither'} indexes~SN for ~SB~FB{user.username}~FC~SN.",
         )
 
         self.delete()
@@ -705,8 +714,9 @@
                 },
                 "content_vector": {
                     "type": "dense_vector",
-                    "dims": 1536,  # Numbers of dims from text-embedding-3-small
-                    # "store": True,  # Keep stored since we need to retrieve it # No need to be explicit
+                    "dims": 256,  # Reduced from openai embedding size of 1536 to 256
+                    "index": True,
+                    "index_options": {"type": "bbq_hnsw"},  # Use bbq_hnsw index options for faster search
                 },
             }
@@ -947,7 +957,10 @@
             return []
 
         story_embedding = response.data[0].embedding
-        return story_embedding
+        # Project the embedding down to 256 dimensions
+        projected_embedding = project_vector(story_embedding)
+
+        return projected_embedding.tolist()
 
     @classmethod
     def debug_index(cls, show_data=True, show_source=False):
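The mapping change above is the core of this diff: `content_vector` drops from 1536 to 256 dimensions, becomes indexed, and opts into `bbq_hnsw` (Better Binary Quantization over an HNSW graph, available in recent Elasticsearch 8.x releases), trading a little recall for much cheaper vector search. A sketch of what creating an index with that mapping looks like via the elasticsearch-py 8.x client; the index name `discover-stories` is illustrative, not taken from this diff:

```python
# Sketch only: index and host names are assumptions; the mapping mirrors
# DiscoverStory's content_vector field from the hunk above.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://db-elasticsearch-v8.service.nyc1.consul:9208")

es.indices.create(
    index="discover-stories",
    mappings={
        "properties": {
            "content_vector": {
                "type": "dense_vector",
                "dims": 256,  # projected embedding size
                "index": True,  # required for knn queries
                "index_options": {"type": "bbq_hnsw"},  # binary-quantized HNSW
            }
        }
    },
)
```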
diff --git a/apps/search/projection_matrix.py b/apps/search/projection_matrix.py
new file mode 100644
index 000000000..4c4b76614
--- /dev/null
+++ b/apps/search/projection_matrix.py
@@ -0,0 +1,37 @@
+import os
+
+import numpy as np
+
+PROJECTION_MATRIX_PATH = os.path.join(os.path.dirname(__file__), "random_projection_matrix.npy")
+INPUT_DIMS = 1536
+OUTPUT_DIMS = 256
+
+
+def generate_projection_matrix():
+    """Generate a random projection matrix for dimensionality reduction."""
+    # Use a fixed random seed for reproducibility
+    np.random.seed(42)
+
+    # Generate random matrix
+    projection = np.random.normal(0, 1 / np.sqrt(OUTPUT_DIMS), (OUTPUT_DIMS, INPUT_DIMS))
+
+    # Normalize the matrix
+    projection = projection / np.linalg.norm(projection, axis=1)[:, np.newaxis]
+
+    return projection
+
+
+def get_projection_matrix():
+    """Get the projection matrix, generating it if it doesn't exist."""
+    if not os.path.exists(PROJECTION_MATRIX_PATH):
+        projection = generate_projection_matrix()
+        np.save(PROJECTION_MATRIX_PATH, projection)
+    return np.load(PROJECTION_MATRIX_PATH)
+
+
+def project_vector(vector):
+    """Project a vector from 1536 dimensions to 256 dimensions."""
+    projection = get_projection_matrix()
+    projected = np.dot(projection, vector)
+    # Normalize the projected vector
+    return projected / np.linalg.norm(projected)
diff --git a/apps/search/random_projection_matrix.npy b/apps/search/random_projection_matrix.npy
new file mode 100644
index 000000000..1dd99aed0
Binary files /dev/null and b/apps/search/random_projection_matrix.npy differ
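`projection_matrix.py` is a classic random projection in the Johnson-Lindenstrauss sense: each of the 256 rows is a Gaussian vector normalized to unit length, so the projection approximately preserves angles between 1536-dim embeddings. Because `project_vector` also re-normalizes its output, the stored vectors are unit-norm, and dot product equals cosine similarity. A self-contained check of the similarity-preservation claim, re-creating the same construction with a fresh generator (so the matrix is not byte-identical to the shipped `.npy`):

```python
# Standalone check that the random projection roughly preserves cosine
# similarity; re-implements the module's math so it runs without the app.
import numpy as np

INPUT_DIMS, OUTPUT_DIMS = 1536, 256

rng = np.random.default_rng(42)
projection = rng.normal(0, 1 / np.sqrt(OUTPUT_DIMS), (OUTPUT_DIMS, INPUT_DIMS))
projection /= np.linalg.norm(projection, axis=1)[:, np.newaxis]

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# Two synthetic "embeddings" with a known overlap
a = rng.normal(size=INPUT_DIMS)
b = 0.5 * a + rng.normal(size=INPUT_DIMS)

print(f"cosine in 1536d: {cosine(a, b):+.3f}")
print(f"cosine in  256d: {cosine(projection @ a, projection @ b):+.3f}")
# The two agree to roughly O(1/sqrt(256)) ~ 0.06 for typical vectors
```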
diff --git a/newsblur_web/docker_local_settings.py b/newsblur_web/docker_local_settings.py
index 5e193fc26..3c8ddf850 100644
--- a/newsblur_web/docker_local_settings.py
+++ b/newsblur_web/docker_local_settings.py
@@ -129,10 +129,11 @@
 SESSION_REDIS_DB = 5
 
 ELASTICSEARCH_FEED_HOSTS = ["db_elasticsearch:9200"]
 ELASTICSEARCH_STORY_HOSTS = ["db_elasticsearch:9200"]
+ELASTICSEARCH_DISCOVER_HOSTS = ["db_elasticsearch:9200"]
 
 ELASTICSEARCH_FEED_HOST = "http://db_elasticsearch:9200"
 ELASTICSEARCH_STORY_HOST = "http://db_elasticsearch:9200"
-
+ELASTICSEARCH_DISCOVER_HOST = "http://db_elasticsearch:9200"
 
 BACKED_BY_AWS = {
     "pages_on_node": False,
     "pages_on_s3": False,
diff --git a/newsblur_web/settings.py b/newsblur_web/settings.py
index dff722bfa..2cb645adf 100644
--- a/newsblur_web/settings.py
+++ b/newsblur_web/settings.py
@@ -487,6 +487,7 @@ CELERY_BEAT_SCHEDULE = {
 # =========
 # = Mongo =
 # =========
+
 if DOCKERBUILD:
     MONGO_PORT = 29019
 else:
@@ -500,6 +501,14 @@ MONGO_ANALYTICS_DB = {
     "name": "nbanalytics",
 }
 
+# =================
+# = Elasticsearch =
+# =================
+
+ELASTICSEARCH_FEED_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
+ELASTICSEARCH_STORY_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
+ELASTICSEARCH_DISCOVER_HOST = "http://db-elasticsearch-v8.service.nyc1.consul:9208"
+
 # ====================
 # = Database Routers =
 # ====================
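With the settings above, feed and story search stay on the v7 cluster while discover traffic goes to v8. A hedged sketch of a discover-style query end to end: embed, project to 256 dims with the helper added in this diff, then kNN-search the vector field (the index name follows the earlier sketch and is an assumption, not something this diff defines):

```python
# Sketch only: the index name is illustrative; project_vector is the real
# helper from apps/search/projection_matrix.py added in this diff.
import numpy as np
from elasticsearch import Elasticsearch

from apps.search.projection_matrix import project_vector

es = Elasticsearch("http://db-elasticsearch-v8.service.nyc1.consul:9208")

# Stand-in for a real 1536-dim OpenAI text-embedding-3-small embedding
query_embedding = np.random.normal(size=1536)
query_vector = project_vector(query_embedding).tolist()  # 256 dims, unit norm

results = es.search(
    index="discover-stories",
    knn={
        "field": "content_vector",
        "query_vector": query_vector,
        "k": 10,
        "num_candidates": 100,
    },
)
for hit in results["hits"]["hits"]:
    print(hit["_id"], hit["_score"])
```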