Mirror of https://github.com/samuelclay/NewsBlur.git (synced 2025-04-13 09:42:01 +00:00)
Re-projecting and re-normalizing dense vectors from 1536 dimensions down to 256, and applying a bbq_hnsw index to save ~96% of memory usage on vector embeddings.
This commit is contained in:
  parent be7380a0a4
  commit 5f02400567

8 changed files with 127 additions and 23 deletions
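A rough sanity check on that memory figure (not part of the commit; BBQ stores roughly one bit per dimension plus small per-vector correction factors, so the byte counts below are approximate):

# Back-of-envelope memory per vector, assuming float32 inputs and ~1 bit/dim for BBQ.
FLOAT32_BYTES = 4

raw_1536 = 1536 * FLOAT32_BYTES  # 6144 bytes: original OpenAI embedding
raw_256 = 256 * FLOAT32_BYTES    # 1024 bytes after random projection (~83% smaller)
bbq_256 = 256 // 8               # ~32 bytes once bbq_hnsw binary-quantizes the 256 dims

print(f"1536-dim float32: {raw_1536} B")
print(f" 256-dim float32: {raw_256} B ({1 - raw_256 / raw_1536:.0%} smaller)")
print(f" 256-dim BBQ:     ~{bbq_256} B (well past the 96% figure)")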
@@ -35,16 +35,16 @@
     name: vm.max_map_count
     value: "262144"
     state: present
 
-- name: Start Elasticsearch Docker container
+- name: Start Elasticsearch v7 Docker container
   docker_container:
     name: elasticsearch
     image: elasticsearch:7.14.0
     state: started
     hostname: "{{ inventory_hostname }}"
     ports:
-      - '9200:9200'
-      - '9300:9300'
+      - "9200:9200"
+      - "9300:9300"
     restart_policy: unless-stopped
     container_default_behavior: no_defaults
     networks_cli_compatible: yes
@@ -52,7 +52,7 @@
     network_mode: default
     networks:
       - name: newsblurnet
-       aliases:
+        aliases:
           - elasticsearch
     user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
     volumes:
@@ -64,8 +64,49 @@
   tags: consul
   become: yes
   template:
-    src: consul_service.json
+    src: consul_service.json.j2
     dest: /etc/consul.d/elasticsearch.json
+  vars:
+    consul_service_name: elasticsearch
+    consul_service_port: 9200
+    elasticsearch_version: 7
   notify:
     - reload consul
 
+- name: Start Elasticsearch v8 Docker container
+  docker_container:
+    name: elasticsearch
+    image: elasticsearch:8.17.0
+    state: started
+    hostname: "{{ inventory_hostname }}"
+    ports:
+      - "9208:9200"
+      - "9308:9300"
+    restart_policy: unless-stopped
+    container_default_behavior: no_defaults
+    networks_cli_compatible: yes
+    # network_mode: host
+    network_mode: default
+    networks:
+      - name: newsblurnet
+        aliases:
+          - elasticsearch
+    user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
+    volumes:
+      - /srv/newsblur/docker/volumes/elasticsearch8:/usr/share/elasticsearch/data
+      - /var/log/elasticsearch8/:/var/log/elasticsearch/
+      - /srv/newsblur/config/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
+
+- name: Register elasticsearch in consul
+  tags: consul
+  become: yes
+  template:
+    src: consul_service.json.j2
+    dest: /etc/consul.d/elasticsearch.json
+  vars:
+    consul_service_name: elasticsearch
+    consul_service_port: 9208
+    elasticsearch_version: 8
+  notify:
+    - reload consul
+
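A quick smoke test that both containers answer on their published ports (a sketch, not from the repo; the hostname is an assumption, substitute the real inventory host):

# Hit the root endpoint of each cluster and confirm the major version.
import requests

HOST = "db-elasticsearch"  # hypothetical hostname

for port, major in ((9200, "7"), (9208, "8")):
    info = requests.get(f"http://{HOST}:{port}", timeout=5).json()
    version = info["version"]["number"]
    assert version.startswith(major), (port, version)
    print(f"port {port}: Elasticsearch {version}")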
@@ -1,14 +1,18 @@
 {
   "service": {
-{% if not elasticsearch_secondary %}
-    "name": "db-elasticsearch",
-{% else %}
-    "name": "db-elasticsearch-staging",
+{% if elasticsearch_version == 7 %}
+{% if not elasticsearch_secondary %}
+    "name": "db-elasticsearch",
+{% else %}
+    "name": "db-elasticsearch-staging",
+{% endif %}
+{% elif elasticsearch_version == 8 %}
+    "name": "db-elasticsearch-v8",
 {% endif %}
     "tags": [
       "db"
     ],
-    "port": 9200,
+    "port": {{ consul_service_port }},
     "checks": [{
       "id": "es-ping",
       "http": "http://{{ ansible_host }}:5579/db_check/elasticsearch?consul=1",
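For reference, with the v8 task vars above (consul_service_port: 9208, elasticsearch_version: 8), the template renders a service stanza roughly like the following, shown as a Python dict for illustration (the real output is JSON):

# Approximate render of consul_service.json.j2 for the v8 cluster.
rendered = {
    "service": {
        "name": "db-elasticsearch-v8",
        "tags": ["db"],
        "port": 9208,
        "checks": [
            {
                "id": "es-ping",
                "http": "http://<ansible_host>:5579/db_check/elasticsearch?consul=1",
            }
        ],
    }
}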
@@ -8,7 +8,6 @@ from celery.exceptions import SoftTimeLimitExceeded
 from django.conf import settings
 
 from apps.profile.middleware import DBProfilerMiddleware
-from apps.search.models import DiscoverStory
 from newsblur_web.celeryapp import app
 from utils import log as logging
 from utils.redis_raw_log_middleware import RedisDumpMiddleware
@@ -15,6 +15,7 @@ from django.conf import settings
 from django.contrib.auth.models import User
 from openai import APITimeoutError, OpenAI
 
+from apps.search.projection_matrix import project_vector
 from apps.search.tasks import (
     FinishIndexSubscriptionsForDiscover,
     FinishIndexSubscriptionsForSearch,
@@ -276,7 +277,7 @@ class MUserSearch(mongo.Document):
             feed.index_stories_for_search()
 
     @classmethod
-    def remove_all(cls, drop_index=False):
+    def remove_all(cls, drop_index=False, search=True, discover=True):
         # You only need to drop the index if there is data you want to clear.
         # A new search server won't need this, as there isn't anything to drop.
         if drop_index:
@@ -287,11 +288,11 @@ class MUserSearch(mongo.Document):
         logging.info(" ---> ~SN~FRRemoving ~SB%s~SN user searches..." % user_searches.count())
         for user_search in user_searches:
             try:
-                user_search.remove()
+                user_search.remove(search=search, discover=discover)
             except Exception as e:
                 print(" ****> Error on search removal: %s" % e)
 
-    def remove(self):
+    def remove(self, search=True, discover=True):
         from apps.reader.models import UserSubscription
         from apps.rss_feeds.models import Feed
 
@@ -305,16 +306,24 @@ class MUserSearch(mongo.Document):
                 feed = sub.feed
             except Feed.DoesNotExist:
                 continue
-            if not feed.search_indexed:
+            if search and not discover and not feed.search_indexed:
                 continue
-            feed.search_indexed = False
+            if discover and not search and not feed.discover_indexed:
+                continue
+            if search and discover and not feed.search_indexed and not feed.discover_indexed:
+                continue
+            if search:
+                feed.search_indexed = False
+                feed.search_indexing = False
+            if discover:
+                feed.discover_indexed = False
+                feed.discover_indexing = False
             feed.save()
             removed += 1
 
         logging.user(
             user,
-            "~FCRemoved ~SB%s/%s feed's search indexes~SN for ~SB~FB%s~FC~SN."
-            % (removed, total, user.username),
+            f"~FCRemoved ~SB{removed}/{total} feed's {'search' if search and not discover else 'discover' if discover and not search else 'search+discover' if search and discover else 'neither'} indexes~SN for ~SB~FB{user.username}~FC~SN.",
         )
         self.delete()
 
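With the new flags, search and discover indexes can be cleared independently; a usage sketch (drop_index stays False per the comment above unless there is existing data to clear):

# Reset only the discover flags on indexed feeds, leaving search indexes alone.
from apps.search.models import MUserSearch

MUserSearch.remove_all(drop_index=False, search=False, discover=True)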
@@ -705,8 +714,9 @@ class DiscoverStory:
             },
             "content_vector": {
                 "type": "dense_vector",
-                "dims": 1536,  # Numbers of dims from text-embedding-3-small
-                # "store": True,  # Keep stored since we need to retrieve it # No need to be explicit
+                "dims": 256,  # Reduced from openai embedding size of 1536 to 256
+                "index": True,
+                "index_options": {"type": "bbq_hnsw"},  # Use bbq_hnsw index options for faster search
             },
         }
 
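A minimal query sketch against the new 256-dim field, assuming an elasticsearch-py 8.x client; the index name and source fields are placeholders, and bbq_hnsw changes only how vectors are stored and scanned, not the query shape:

# kNN search over content_vector; the vector must already be projected to 256 dims.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://db-elasticsearch-v8.service.nyc1.consul:9208")

query_vector = [0.0] * 256  # stand-in; in practice this comes from project_vector()
results = es.search(
    index="discover-stories",  # hypothetical index name
    knn={
        "field": "content_vector",
        "query_vector": query_vector,
        "k": 10,
        "num_candidates": 100,
    },
    source=["story_hash", "story_title"],  # hypothetical stored fields
)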
@@ -947,7 +957,10 @@ class DiscoverStory:
             return []
         story_embedding = response.data[0].embedding
 
-        return story_embedding
+        # Project the embedding down to 256 dimensions
+        projected_embedding = project_vector(story_embedding)
+
+        return projected_embedding.tolist()
 
     @classmethod
     def debug_index(cls, show_data=True, show_source=False):
37  apps/search/projection_matrix.py  (new file)
@@ -0,0 +1,37 @@
+import os
+
+import numpy as np
+
+PROJECTION_MATRIX_PATH = os.path.join(os.path.dirname(__file__), "random_projection_matrix.npy")
+INPUT_DIMS = 1536
+OUTPUT_DIMS = 256
+
+
+def generate_projection_matrix():
+    """Generate a random projection matrix for dimensionality reduction."""
+    # Use a fixed random seed for reproducibility
+    np.random.seed(42)
+
+    # Generate random matrix
+    projection = np.random.normal(0, 1 / np.sqrt(OUTPUT_DIMS), (OUTPUT_DIMS, INPUT_DIMS))
+
+    # Normalize the matrix
+    projection = projection / np.linalg.norm(projection, axis=1)[:, np.newaxis]
+
+    return projection
+
+
+def get_projection_matrix():
+    """Get the projection matrix, generating it if it doesn't exist."""
+    if not os.path.exists(PROJECTION_MATRIX_PATH):
+        projection = generate_projection_matrix()
+        np.save(PROJECTION_MATRIX_PATH, projection)
+    return np.load(PROJECTION_MATRIX_PATH)
+
+
+def project_vector(vector):
+    """Project a vector from 1536 dimensions to 256 dimensions."""
+    projection = get_projection_matrix()
+    projected = np.dot(projection, vector)
+    # Normalize the projected vector
+    return projected / np.linalg.norm(projected)
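A quick property check of the module above: project_vector() returns a 256-dim unit vector, which is what lets dot products over projected embeddings behave like cosine similarity:

# Verify output shape and unit L2 norm of a projected embedding.
import numpy as np

from apps.search.projection_matrix import OUTPUT_DIMS, project_vector

embedding = np.random.default_rng(0).normal(size=1536)  # stand-in for an OpenAI embedding
projected = project_vector(embedding)

assert projected.shape == (OUTPUT_DIMS,)
assert np.isclose(np.linalg.norm(projected), 1.0)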
BIN  apps/search/random_projection_matrix.npy  (new binary file, not shown)
@@ -129,10 +129,11 @@ SESSION_REDIS_DB = 5
 
 ELASTICSEARCH_FEED_HOSTS = ["db_elasticsearch:9200"]
 ELASTICSEARCH_STORY_HOSTS = ["db_elasticsearch:9200"]
 ELASTICSEARCH_DISCOVER_HOSTS = ["db_elasticsearch:9200"]
 
 ELASTICSEARCH_FEED_HOST = "http://db_elasticsearch:9200"
 ELASTICSEARCH_STORY_HOST = "http://db_elasticsearch:9200"
 
+ELASTICSEARCH_DISCOVER_HOST = "http://db_elasticsearch:9200"
 BACKED_BY_AWS = {
     "pages_on_node": False,
     "pages_on_s3": False,
@@ -487,6 +487,7 @@ CELERY_BEAT_SCHEDULE = {
 # =========
 # = Mongo =
 # =========
+
 if DOCKERBUILD:
     MONGO_PORT = 29019
 else:
@@ -500,6 +501,14 @@ MONGO_ANALYTICS_DB = {
     "name": "nbanalytics",
 }
 
+# =================
+# = Elasticsearch =
+# =================
+
+ELASTICSEARCH_FEED_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
+ELASTICSEARCH_STORY_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
+ELASTICSEARCH_DISCOVER_HOST = "http://db-elasticsearch-v8.service.nyc1.consul:9208"
+
 # ====================
 # = Database Routers =
 # ====================