Projecting dense vectors down from 1536 dimensions to 256 dimensions and re-normalizing, applying a bbq_hnsw index to save 96% of memory usage on vector embeddings.

Samuel Clay 2024-12-24 19:40:48 -05:00
parent be7380a0a4
commit 5f02400567
8 changed files with 127 additions and 23 deletions
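For a rough sense of where the headline figure comes from, a back-of-envelope sketch (illustrative only, not exact Elasticsearch accounting; it assumes float32 storage before, and BBQ's roughly one bit per dimension held in memory afterwards, ignoring HNSW graph and corrective overhead):

# Approximate per-vector memory before and after this change.
full_float32 = 1536 * 4       # 6,144 bytes: raw text-embedding-3-small vector as float32
projected_float32 = 256 * 4   # 1,024 bytes: after the 256-dim projection alone (~83% smaller)
bbq_resident = 256 / 8        # ~32 bytes: bbq_hnsw keeps roughly 1 bit per dimension resident
print(f"{1 - bbq_resident / full_float32:.1%}")  # ~99.5% before overhead; real savings land nearer the figure above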


@@ -35,16 +35,16 @@
name: vm.max_map_count
value: "262144"
state: present
- name: Start Elasticsearch Docker container
- name: Start Elasticsearch v7 Docker container
docker_container:
name: elasticsearch
image: elasticsearch:7.14.0
state: started
hostname: "{{ inventory_hostname }}"
ports:
- '9200:9200'
- '9300:9300'
- "9200:9200"
- "9300:9300"
restart_policy: unless-stopped
container_default_behavior: no_defaults
networks_cli_compatible: yes
@@ -52,7 +52,7 @@
network_mode: default
networks:
- name: newsblurnet
aliases:
aliases:
- elasticsearch
user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
volumes:
@@ -64,8 +64,49 @@
tags: consul
become: yes
template:
src: consul_service.json
src: consul_service.json.j2
dest: /etc/consul.d/elasticsearch.json
vars:
consul_service_name: elasticsearch
consul_service_port: 9200
elasticsearch_version: 7
notify:
- reload consul
- name: Start Elasticsearch v8 Docker container
docker_container:
name: elasticsearch
image: elasticsearch:8.17.0
state: started
hostname: "{{ inventory_hostname }}"
ports:
- "9208:9200"
- "9308:9300"
restart_policy: unless-stopped
container_default_behavior: no_defaults
networks_cli_compatible: yes
# network_mode: host
network_mode: default
networks:
- name: newsblurnet
aliases:
- elasticsearch
user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
volumes:
- /srv/newsblur/docker/volumes/elasticsearch8:/usr/share/elasticsearch/data
- /var/log/elasticsearch8/:/var/log/elasticsearch/
- /srv/newsblur/config/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
- name: Register elasticsearch in consul
tags: consul
become: yes
template:
src: consul_service.json.j2
dest: /etc/consul.d/elasticsearch.json
vars:
consul_service_name: elasticsearch
consul_service_port: 9208
elasticsearch_version: 8
notify:
- reload consul
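Once this play has run, both containers should answer on their mapped ports on the host; a quick sanity check (a sketch, assuming security is disabled via the mounted elasticsearch.yml, as the plain-HTTP consul check suggests):

import requests

# v7 keeps its original 9200/9300 mapping; the v8 container is mapped to 9208/9308.
for port in (9200, 9208):
    info = requests.get(f"http://localhost:{port}", timeout=5).json()
    print(port, info["version"]["number"])  # expect 7.14.0 on 9200 and 8.17.0 on 9208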


@@ -1,14 +1,18 @@
{
"service": {
{% if not elasticsearch_secondary %}
"name": "db-elasticsearch",
{% else %}
"name": "db-elasticsearch-staging",
{% if elasticsearch_version == 7 %}
{% if not elasticsearch_secondary %}
"name": "db-elasticsearch",
{% else %}
"name": "db-elasticsearch-staging",
{% endif %}
{% elif elasticsearch_version == 8 %}
"name": "db-elasticsearch-v8",
{% endif %}
"tags": [
"db"
],
"port": 9200,
"port": {{ consul_service_port }},
"checks": [{
"id": "es-ping",
"http": "http://{{ ansible_host }}:5579/db_check/elasticsearch?consul=1",


@@ -8,7 +8,6 @@ from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings
from apps.profile.middleware import DBProfilerMiddleware
from apps.search.models import DiscoverStory
from newsblur_web.celeryapp import app
from utils import log as logging
from utils.redis_raw_log_middleware import RedisDumpMiddleware


@@ -15,6 +15,7 @@ from django.conf import settings
from django.contrib.auth.models import User
from openai import APITimeoutError, OpenAI
from apps.search.projection_matrix import project_vector
from apps.search.tasks import (
FinishIndexSubscriptionsForDiscover,
FinishIndexSubscriptionsForSearch,
@@ -276,7 +277,7 @@ class MUserSearch(mongo.Document):
feed.index_stories_for_search()
@classmethod
def remove_all(cls, drop_index=False):
def remove_all(cls, drop_index=False, search=True, discover=True):
# You only need to drop the index if there is data you want to clear.
# A new search server won't need this, as there isn't anything to drop.
if drop_index:
@@ -287,11 +288,11 @@
logging.info(" ---> ~SN~FRRemoving ~SB%s~SN user searches..." % user_searches.count())
for user_search in user_searches:
try:
user_search.remove()
user_search.remove(search=search, discover=discover)
except Exception as e:
print(" ****> Error on search removal: %s" % e)
def remove(self):
def remove(self, search=True, discover=True):
from apps.reader.models import UserSubscription
from apps.rss_feeds.models import Feed
@@ -305,16 +306,24 @@
feed = sub.feed
except Feed.DoesNotExist:
continue
if not feed.search_indexed:
if search and not discover and not feed.search_indexed:
continue
feed.search_indexed = False
if discover and not search and not feed.discover_indexed:
continue
if search and discover and not feed.search_indexed and not feed.discover_indexed:
continue
if search:
feed.search_indexed = False
feed.search_indexing = False
if discover:
feed.discover_indexed = False
feed.discover_indexing = False
feed.save()
removed += 1
logging.user(
user,
"~FCRemoved ~SB%s/%s feed's search indexes~SN for ~SB~FB%s~FC~SN."
% (removed, total, user.username),
f"~FCRemoved ~SB{removed}/{total} feed's {'search' if search and not discover else 'discover' if discover and not search else 'search+discover' if search and discover else 'neither'} indexes~SN for ~SB~FB{user.username}~FC~SN.",
)
self.delete()
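With the new flags, search and discover indexes can be cleared independently; a usage sketch based on the signature above (assuming the class lives in apps.search.models alongside DiscoverStory):

from apps.search.models import MUserSearch

# Rebuild only the discover-side indexes; search indexing flags are left untouched.
MUserSearch.remove_all(drop_index=False, search=False, discover=True)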
@@ -705,8 +714,9 @@ class DiscoverStory:
},
"content_vector": {
"type": "dense_vector",
"dims": 1536, # Numbers of dims from text-embedding-3-small
# "store": True, # Keep stored since we need to retrieve it # No need to be explicit
"dims": 256, # Reduced from openai embedding size of 1536 to 256
"index": True,
"index_options": {"type": "bbq_hnsw"}, # Use bbq_hnsw index options for faster search
},
}
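At query time the same 256-dimension projection has to be applied to the query embedding before running a kNN search against this field; a minimal sketch using the elasticsearch-py knn search option (the index name, client wiring, and placeholder embedding are assumptions):

from django.conf import settings
from elasticsearch import Elasticsearch

from apps.search.projection_matrix import project_vector

# Placeholder for a raw 1536-dim embedding from text-embedding-3-small.
openai_embedding = [0.01] * 1536

es = Elasticsearch(settings.ELASTICSEARCH_DISCOVER_HOST)  # assumed client construction
results = es.search(
    index="discover-stories",  # hypothetical index name
    knn={
        "field": "content_vector",
        "query_vector": project_vector(openai_embedding).tolist(),
        "k": 10,
        "num_candidates": 100,
    },
)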
@@ -947,7 +957,10 @@ class DiscoverStory:
return []
story_embedding = response.data[0].embedding
return story_embedding
# Project the embedding down to 256 dimensions
projected_embedding = project_vector(story_embedding)
return projected_embedding.tolist()
@classmethod
def debug_index(cls, show_data=True, show_source=False):


@@ -0,0 +1,37 @@
import os

import numpy as np

PROJECTION_MATRIX_PATH = os.path.join(os.path.dirname(__file__), "random_projection_matrix.npy")
INPUT_DIMS = 1536
OUTPUT_DIMS = 256


def generate_projection_matrix():
    """Generate a random projection matrix for dimensionality reduction."""
    # Use a fixed random seed for reproducibility
    np.random.seed(42)

    # Generate random matrix
    projection = np.random.normal(0, 1 / np.sqrt(OUTPUT_DIMS), (OUTPUT_DIMS, INPUT_DIMS))

    # Normalize the matrix
    projection = projection / np.linalg.norm(projection, axis=1)[:, np.newaxis]

    return projection


def get_projection_matrix():
    """Get the projection matrix, generating it if it doesn't exist."""
    if not os.path.exists(PROJECTION_MATRIX_PATH):
        projection = generate_projection_matrix()
        np.save(PROJECTION_MATRIX_PATH, projection)
    return np.load(PROJECTION_MATRIX_PATH)


def project_vector(vector):
    """Project a vector from 1536 dimensions to 256 dimensions."""
    projection = get_projection_matrix()
    projected = np.dot(projection, vector)

    # Normalize the projected vector
    return projected / np.linalg.norm(projected)

Binary file not shown.
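The binary file above is presumably the cached output of generate_projection_matrix(). A quick sanity check on the approach (a sketch using the helpers from this new module): random Gaussian projections approximately preserve cosine similarity, which is why the 256-dimension vectors remain usable for nearest-neighbor search.

import numpy as np

from apps.search.projection_matrix import project_vector

rng = np.random.default_rng(0)
a = rng.normal(size=1536)
b = a + 0.1 * rng.normal(size=1536)  # a nearby vector

def cosine(x, y):
    return float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))

# Cosine similarity before vs. after projection should come out close, not identical.
print(cosine(a, b), cosine(project_vector(a), project_vector(b)))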


@@ -129,10 +129,11 @@ SESSION_REDIS_DB = 5
ELASTICSEARCH_FEED_HOSTS = ["db_elasticsearch:9200"]
ELASTICSEARCH_STORY_HOSTS = ["db_elasticsearch:9200"]
ELASTICSEARCH_DISCOVER_HOSTS = ["db_elasticsearch:9200"]
ELASTICSEARCH_FEED_HOST = "http://db_elasticsearch:9200"
ELASTICSEARCH_STORY_HOST = "http://db_elasticsearch:9200"
ELASTICSEARCH_DISCOVER_HOST = "http://db_elasticsearch:9200"
BACKED_BY_AWS = {
"pages_on_node": False,
"pages_on_s3": False,


@@ -487,6 +487,7 @@ CELERY_BEAT_SCHEDULE = {
# =========
# = Mongo =
# =========
if DOCKERBUILD:
MONGO_PORT = 29019
else:
@@ -500,6 +501,14 @@ MONGO_ANALYTICS_DB = {
"name": "nbanalytics",
}
# =================
# = Elasticsearch =
# =================
ELASTICSEARCH_FEED_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
ELASTICSEARCH_STORY_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
ELASTICSEARCH_DISCOVER_HOST = "http://db-elasticsearch-v8.service.nyc1.consul:9208"
# ====================
# = Database Routers =
# ====================
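Since ELASTICSEARCH_DISCOVER_HOST now points at the v8 cluster (the db-elasticsearch-v8 consul service on 9208) while feed and story search stay on the existing v7 service, discover traffic needs its own connection; a minimal sketch, assuming clients are built directly from these settings and that the installed elasticsearch-py major version matches the cluster it talks to:

from django.conf import settings
from elasticsearch import Elasticsearch

# Dedicated client for the discover (vector) cluster; feed/story search keep
# whatever client currently talks to the v7 hosts on 9200.
discover_es = Elasticsearch(settings.ELASTICSEARCH_DISCOVER_HOST)
print(discover_es.info()["version"]["number"])  # expect 8.17.x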