Projecting dense vectors down from 1536 dimensions to 256 dimensions and re-normalizing, applying a bbq_hnsw index to save 96% of memory usage on vector embeddings.

Samuel Clay 2024-12-24 19:40:48 -05:00
parent be7380a0a4
commit 5f02400567
8 changed files with 127 additions and 23 deletions
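For a rough sense of where the headline figure comes from, a back-of-envelope sketch (illustrative only, not exact Elasticsearch accounting; it assumes float32 storage before, and BBQ's roughly one bit per dimension held in memory afterwards, ignoring HNSW graph and corrective overhead):

# Approximate per-vector memory before and after this change.
full_float32 = 1536 * 4       # 6,144 bytes: raw text-embedding-3-small vector as float32
projected_float32 = 256 * 4   # 1,024 bytes: after the 256-dim projection alone (~83% smaller)
bbq_resident = 256 / 8        # ~32 bytes: bbq_hnsw keeps roughly 1 bit per dimension resident
print(f"{1 - bbq_resident / full_float32:.1%}")  # ~99.5% before overhead; real savings land nearer the figure above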


@@ -35,16 +35,16 @@
name: vm.max_map_count
value: "262144"
state: present
- name: Start Elasticsearch Docker container
- name: Start Elasticsearch v7 Docker container
docker_container:
name: elasticsearch
image: elasticsearch:7.14.0
state: started
hostname: "{{ inventory_hostname }}"
ports:
- '9200:9200'
- '9300:9300'
- "9200:9200"
- "9300:9300"
restart_policy: unless-stopped
container_default_behavior: no_defaults
networks_cli_compatible: yes
@@ -52,7 +52,7 @@
network_mode: default
networks:
- name: newsblurnet
aliases:
aliases:
- elasticsearch
user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
volumes:
@@ -64,8 +64,49 @@
tags: consul
become: yes
template:
src: consul_service.json
src: consul_service.json.j2
dest: /etc/consul.d/elasticsearch.json
vars:
consul_service_name: elasticsearch
consul_service_port: 9200
elasticsearch_version: 7
notify:
- reload consul
- name: Start Elasticsearch v8 Docker container
docker_container:
name: elasticsearch
image: elasticsearch:8.17.0
state: started
hostname: "{{ inventory_hostname }}"
ports:
- "9208:9200"
- "9308:9300"
restart_policy: unless-stopped
container_default_behavior: no_defaults
networks_cli_compatible: yes
# network_mode: host
network_mode: default
networks:
- name: newsblurnet
aliases:
- elasticsearch
user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
volumes:
- /srv/newsblur/docker/volumes/elasticsearch8:/usr/share/elasticsearch/data
- /var/log/elasticsearch8/:/var/log/elasticsearch/
- /srv/newsblur/config/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
- name: Register elasticsearch in consul
tags: consul
become: yes
template:
src: consul_service.json.j2
dest: /etc/consul.d/elasticsearch.json
vars:
consul_service_name: elasticsearch
consul_service_port: 9208
elasticsearch_version: 8
notify:
- reload consul
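Once this play has run, both containers should answer on their mapped ports on the host; a quick sanity check (a sketch, assuming security is disabled via the mounted elasticsearch.yml, as the plain-HTTP consul check suggests):

import requests

# v7 keeps its original 9200/9300 mapping; the v8 container is mapped to 9208/9308.
for port in (9200, 9208):
    info = requests.get(f"http://localhost:{port}", timeout=5).json()
    print(port, info["version"]["number"])  # expect 7.14.0 on 9200 and 8.17.0 on 9208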


@@ -1,14 +1,18 @@
{
"service": {
{% if not elasticsearch_secondary %}
"name": "db-elasticsearch",
{% else %}
"name": "db-elasticsearch-staging",
{% if elasticsearch_version == 7 %}
{% if not elasticsearch_secondary %}
"name": "db-elasticsearch",
{% else %}
"name": "db-elasticsearch-staging",
{% endif %}
{% elif elasticsearch_version == 8 %}
"name": "db-elasticsearch-v8",
{% endif %}
"tags": [
"db"
],
"port": 9200,
"port": {{ consul_service_port }},
"checks": [{
"id": "es-ping",
"http": "http://{{ ansible_host }}:5579/db_check/elasticsearch?consul=1",


@@ -8,7 +8,6 @@ from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings
from apps.profile.middleware import DBProfilerMiddleware
from apps.search.models import DiscoverStory
from newsblur_web.celeryapp import app
from utils import log as logging
from utils.redis_raw_log_middleware import RedisDumpMiddleware


@@ -15,6 +15,7 @@ from django.conf import settings
from django.contrib.auth.models import User
from openai import APITimeoutError, OpenAI
from apps.search.projection_matrix import project_vector
from apps.search.tasks import (
FinishIndexSubscriptionsForDiscover,
FinishIndexSubscriptionsForSearch,
@@ -276,7 +277,7 @@ class MUserSearch(mongo.Document):
feed.index_stories_for_search()
@classmethod
def remove_all(cls, drop_index=False):
def remove_all(cls, drop_index=False, search=True, discover=True):
# You only need to drop the index if there is data you want to clear.
# A new search server won't need this, as there isn't anything to drop.
if drop_index:
@@ -287,11 +288,11 @@
logging.info(" ---> ~SN~FRRemoving ~SB%s~SN user searches..." % user_searches.count())
for user_search in user_searches:
try:
user_search.remove()
user_search.remove(search=search, discover=discover)
except Exception as e:
print(" ****> Error on search removal: %s" % e)
def remove(self):
def remove(self, search=True, discover=True):
from apps.reader.models import UserSubscription
from apps.rss_feeds.models import Feed
@@ -305,16 +306,24 @@
feed = sub.feed
except Feed.DoesNotExist:
continue
if not feed.search_indexed:
if search and not discover and not feed.search_indexed:
continue
feed.search_indexed = False
if discover and not search and not feed.discover_indexed:
continue
if search and discover and not feed.search_indexed and not feed.discover_indexed:
continue
if search:
feed.search_indexed = False
feed.search_indexing = False
if discover:
feed.discover_indexed = False
feed.discover_indexing = False
feed.save()
removed += 1
logging.user(
user,
"~FCRemoved ~SB%s/%s feed's search indexes~SN for ~SB~FB%s~FC~SN."
% (removed, total, user.username),
f"~FCRemoved ~SB{removed}/{total} feed's {'search' if search and not discover else 'discover' if discover and not search else 'search+discover' if search and discover else 'neither'} indexes~SN for ~SB~FB{user.username}~FC~SN.",
)
self.delete()
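With the new flags, search and discover indexes can be cleared independently; a usage sketch based on the signature above (assuming the class lives in apps.search.models alongside DiscoverStory):

from apps.search.models import MUserSearch

# Rebuild only the discover-side indexes; search indexing flags are left untouched.
MUserSearch.remove_all(drop_index=False, search=False, discover=True)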
@@ -705,8 +714,9 @@ class DiscoverStory:
},
"content_vector": {
"type": "dense_vector",
"dims": 1536, # Numbers of dims from text-embedding-3-small
# "store": True, # Keep stored since we need to retrieve it # No need to be explicit
"dims": 256, # Reduced from openai embedding size of 1536 to 256
"index": True,
"index_options": {"type": "bbq_hnsw"}, # Use bbq_hnsw index options for faster search
},
}
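At query time the same 256-dimension projection has to be applied to the query embedding before running a kNN search against this field; a minimal sketch using the elasticsearch-py knn search option (the index name, client wiring, and placeholder embedding are assumptions):

from django.conf import settings
from elasticsearch import Elasticsearch

from apps.search.projection_matrix import project_vector

# Placeholder for a raw 1536-dim embedding from text-embedding-3-small.
openai_embedding = [0.01] * 1536

es = Elasticsearch(settings.ELASTICSEARCH_DISCOVER_HOST)  # assumed client construction
results = es.search(
    index="discover-stories",  # hypothetical index name
    knn={
        "field": "content_vector",
        "query_vector": project_vector(openai_embedding).tolist(),
        "k": 10,
        "num_candidates": 100,
    },
)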
@@ -947,7 +957,10 @@ class DiscoverStory:
return []
story_embedding = response.data[0].embedding
return story_embedding
# Project the embedding down to 256 dimensions
projected_embedding = project_vector(story_embedding)
return projected_embedding.tolist()
@classmethod
def debug_index(cls, show_data=True, show_source=False):


@@ -0,0 +1,37 @@
import os

import numpy as np

PROJECTION_MATRIX_PATH = os.path.join(os.path.dirname(__file__), "random_projection_matrix.npy")
INPUT_DIMS = 1536
OUTPUT_DIMS = 256


def generate_projection_matrix():
    """Generate a random projection matrix for dimensionality reduction."""
    # Use a fixed random seed for reproducibility
    np.random.seed(42)

    # Generate random matrix
    projection = np.random.normal(0, 1 / np.sqrt(OUTPUT_DIMS), (OUTPUT_DIMS, INPUT_DIMS))

    # Normalize the matrix
    projection = projection / np.linalg.norm(projection, axis=1)[:, np.newaxis]

    return projection


def get_projection_matrix():
    """Get the projection matrix, generating it if it doesn't exist."""
    if not os.path.exists(PROJECTION_MATRIX_PATH):
        projection = generate_projection_matrix()
        np.save(PROJECTION_MATRIX_PATH, projection)
    return np.load(PROJECTION_MATRIX_PATH)


def project_vector(vector):
    """Project a vector from 1536 dimensions to 256 dimensions."""
    projection = get_projection_matrix()
    projected = np.dot(projection, vector)

    # Normalize the projected vector
    return projected / np.linalg.norm(projected)

Binary file not shown.
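The binary file above is presumably the cached output of generate_projection_matrix(). A quick sanity check on the approach (a sketch using the helpers from this new module): random Gaussian projections approximately preserve cosine similarity, which is why the 256-dimension vectors remain usable for nearest-neighbor search.

import numpy as np

from apps.search.projection_matrix import project_vector

rng = np.random.default_rng(0)
a = rng.normal(size=1536)
b = a + 0.1 * rng.normal(size=1536)  # a nearby vector

def cosine(x, y):
    return float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))

# Cosine similarity before vs. after projection should come out close, not identical.
print(cosine(a, b), cosine(project_vector(a), project_vector(b)))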


@@ -129,10 +129,11 @@ SESSION_REDIS_DB = 5
ELASTICSEARCH_FEED_HOSTS = ["db_elasticsearch:9200"]
ELASTICSEARCH_STORY_HOSTS = ["db_elasticsearch:9200"]
ELASTICSEARCH_DISCOVER_HOSTS = ["db_elasticsearch:9200"]
ELASTICSEARCH_FEED_HOST = "http://db_elasticsearch:9200"
ELASTICSEARCH_STORY_HOST = "http://db_elasticsearch:9200"
ELASTICSEARCH_DISCOVER_HOST = "http://db_elasticsearch:9200"
BACKED_BY_AWS = {
"pages_on_node": False,
"pages_on_s3": False,


@@ -487,6 +487,7 @@ CELERY_BEAT_SCHEDULE = {
# =========
# = Mongo =
# =========
if DOCKERBUILD:
MONGO_PORT = 29019
else:
@@ -500,6 +501,14 @@ MONGO_ANALYTICS_DB = {
"name": "nbanalytics",
}
# =================
# = Elasticsearch =
# =================
ELASTICSEARCH_FEED_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
ELASTICSEARCH_STORY_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
ELASTICSEARCH_DISCOVER_HOST = "http://db-elasticsearch-v8.service.nyc1.consul:9208"
# ====================
# = Database Routers =
# ====================
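Since ELASTICSEARCH_DISCOVER_HOST now points at the v8 cluster (the db-elasticsearch-v8 consul service on 9208) while feed and story search stay on the existing v7 service, discover traffic needs its own connection; a minimal sketch, assuming clients are built directly from these settings and that the installed elasticsearch-py major version matches the cluster it talks to:

from django.conf import settings
from elasticsearch import Elasticsearch

# Dedicated client for the discover (vector) cluster; feed/story search keep
# whatever client currently talks to the v7 hosts on 9200.
discover_es = Elasticsearch(settings.ELASTICSEARCH_DISCOVER_HOST)
print(discover_es.info()["version"]["number"])  # expect 8.17.x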