mirror of https://github.com/samuelclay/NewsBlur.git (synced 2025-04-13 09:42:01 +00:00)
Re-normalizing dense vectors from 1536 dimensions to 256 dimensions, applying a bbq_hnsw index to save 96% of memory usage on vector embeddings.
This commit is contained in:
parent be7380a0a4
commit 5f02400567
8 changed files with 127 additions and 23 deletions
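Rough arithmetic behind the memory claim (a back-of-the-envelope sketch, mine rather than the commit's): a 1536-dim float32 embedding costs about 6 KB; projecting to 256 dims alone cuts that by ~83%, and bbq (better binary quantization) stores roughly one bit per dimension in the HNSW graph, pushing the per-vector index footprint down to a few dozen bytes. The quoted 96% presumably lands between these bounds, depending on which overheads are counted.

FLOAT32_BYTES = 4
full_fp32 = 1536 * FLOAT32_BYTES  # 6144 bytes: raw text-embedding-3-small vector
proj_fp32 = 256 * FLOAT32_BYTES   # 1024 bytes: after random projection
proj_bbq = 256 // 8               # ~32 bytes: bbq quantizes to ~1 bit per dimension
print(f"projection alone saves {1 - proj_fp32 / full_fp32:.0%}")              # 83%
print(f"projection + bbq saves ~{1 - proj_bbq / full_fp32:.1%} of index RAM")  # ~99.5%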
@@ -36,15 +36,15 @@
     value: "262144"
     state: present
 
-- name: Start Elasticsearch Docker container
+- name: Start Elasticsearch v7 Docker container
   docker_container:
     name: elasticsearch
     image: elasticsearch:7.14.0
     state: started
     hostname: "{{ inventory_hostname }}"
     ports:
-      - '9200:9200'
-      - '9300:9300'
+      - "9200:9200"
+      - "9300:9300"
     restart_policy: unless-stopped
     container_default_behavior: no_defaults
     networks_cli_compatible: yes
@@ -64,8 +64,49 @@
   tags: consul
   become: yes
   template:
-    src: consul_service.json
+    src: consul_service.json.j2
     dest: /etc/consul.d/elasticsearch.json
+  vars:
+    consul_service_name: elasticsearch
+    consul_service_port: 9200
+    elasticsearch_version: 7
+  notify:
+    - reload consul
+
+- name: Start Elasticsearch v8 Docker container
+  docker_container:
+    name: elasticsearch
+    image: elasticsearch:8.17.0
+    state: started
+    hostname: "{{ inventory_hostname }}"
+    ports:
+      - "9208:9200"
+      - "9308:9300"
+    restart_policy: unless-stopped
+    container_default_behavior: no_defaults
+    networks_cli_compatible: yes
+    # network_mode: host
+    network_mode: default
+    networks:
+      - name: newsblurnet
+        aliases:
+          - elasticsearch
+    user: "{{ ansible_effective_user_id|int }}:{{ ansible_effective_group_id|int }}"
+    volumes:
+      - /srv/newsblur/docker/volumes/elasticsearch8:/usr/share/elasticsearch/data
+      - /var/log/elasticsearch8/:/var/log/elasticsearch/
+      - /srv/newsblur/config/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
+
+- name: Register elasticsearch in consul
+  tags: consul
+  become: yes
+  template:
+    src: consul_service.json.j2
+    dest: /etc/consul.d/elasticsearch.json
+  vars:
+    consul_service_name: elasticsearch
+    consul_service_port: 9208
+    elasticsearch_version: 8
   notify:
     - reload consul
 
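Once both tasks have run, each container should answer on its mapped host port; a quick check from any box on the Docker network (the hostname alias is a placeholder, and this sketch assumes security is disabled as in the bundled elasticsearch.yml):

import requests

# Sketch: confirm both ES containers report the expected versions.
for label, port, expected in [("v7", 9200, "7.14.0"), ("v8", 9208, "8.17.0")]:
    info = requests.get(f"http://db-elasticsearch:{port}", timeout=5).json()
    print(label, info["version"]["number"], "expected", expected)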
@@ -1,14 +1,18 @@
 {
   "service": {
+{% if elasticsearch_version == 7 %}
 {% if not elasticsearch_secondary %}
     "name": "db-elasticsearch",
 {% else %}
     "name": "db-elasticsearch-staging",
 {% endif %}
+{% elif elasticsearch_version == 8 %}
+    "name": "db-elasticsearch-v8",
+{% endif %}
     "tags": [
       "db"
     ],
-    "port": 9200,
+    "port": {{ consul_service_port }},
     "checks": [{
       "id": "es-ping",
       "http": "http://{{ ansible_host }}:5579/db_check/elasticsearch?consul=1",
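To sanity-check the new template branches, the file can be rendered directly with Jinja2 using the same vars the playbook passes (a sketch; the path and host values are placeholders):

from jinja2 import Template

# Render consul_service.json.j2 as the v8 registration task would.
source = open("consul_service.json.j2").read()
print(
    Template(source).render(
        elasticsearch_version=8,
        elasticsearch_secondary=False,
        consul_service_port=9208,
        ansible_host="203.0.113.10",  # placeholder host
    )
)
# Expect "name": "db-elasticsearch-v8" and "port": 9208 in the output.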
@@ -8,7 +8,6 @@ from celery.exceptions import SoftTimeLimitExceeded
 from django.conf import settings
 
 from apps.profile.middleware import DBProfilerMiddleware
-from apps.search.models import DiscoverStory
 from newsblur_web.celeryapp import app
 from utils import log as logging
 from utils.redis_raw_log_middleware import RedisDumpMiddleware
@@ -15,6 +15,7 @@ from django.conf import settings
 from django.contrib.auth.models import User
 from openai import APITimeoutError, OpenAI
 
+from apps.search.projection_matrix import project_vector
 from apps.search.tasks import (
     FinishIndexSubscriptionsForDiscover,
     FinishIndexSubscriptionsForSearch,
@ -276,7 +277,7 @@ class MUserSearch(mongo.Document):
|
||||||
feed.index_stories_for_search()
|
feed.index_stories_for_search()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def remove_all(cls, drop_index=False):
|
def remove_all(cls, drop_index=False, search=True, discover=True):
|
||||||
# You only need to drop the index if there is data you want to clear.
|
# You only need to drop the index if there is data you want to clear.
|
||||||
# A new search server won't need this, as there isn't anything to drop.
|
# A new search server won't need this, as there isn't anything to drop.
|
||||||
if drop_index:
|
if drop_index:
|
||||||
|
@@ -287,11 +288,11 @@ class MUserSearch(mongo.Document):
         logging.info(" ---> ~SN~FRRemoving ~SB%s~SN user searches..." % user_searches.count())
         for user_search in user_searches:
             try:
-                user_search.remove()
+                user_search.remove(search=search, discover=discover)
             except Exception as e:
                 print(" ****> Error on search removal: %s" % e)
 
-    def remove(self):
+    def remove(self, search=True, discover=True):
         from apps.reader.models import UserSubscription
         from apps.rss_feeds.models import Feed
 
@@ -305,16 +306,24 @@ class MUserSearch(mongo.Document):
                 feed = sub.feed
             except Feed.DoesNotExist:
                 continue
-            if not feed.search_indexed:
+            if search and not discover and not feed.search_indexed:
                 continue
-            feed.search_indexed = False
+            if discover and not search and not feed.discover_indexed:
+                continue
+            if search and discover and not feed.search_indexed and not feed.discover_indexed:
+                continue
+            if search:
+                feed.search_indexed = False
+                feed.search_indexing = False
+            if discover:
+                feed.discover_indexed = False
+                feed.discover_indexing = False
             feed.save()
             removed += 1
 
         logging.user(
             user,
-            "~FCRemoved ~SB%s/%s feed's search indexes~SN for ~SB~FB%s~FC~SN."
-            % (removed, total, user.username),
+            f"~FCRemoved ~SB{removed}/{total} feed's {'search' if search and not discover else 'discover' if discover and not search else 'search+discover' if search and discover else 'neither'} indexes~SN for ~SB~FB{user.username}~FC~SN.",
         )
         self.delete()
 
@@ -705,8 +714,9 @@ class DiscoverStory:
             },
             "content_vector": {
                 "type": "dense_vector",
-                "dims": 1536,  # Numbers of dims from text-embedding-3-small
-                # "store": True,  # Keep stored since we need to retrieve it  # No need to be explicit
+                "dims": 256,  # Reduced from openai embedding size of 1536 to 256
+                "index": True,
+                "index_options": {"type": "bbq_hnsw"},  # Use bbq_hnsw index options for faster search
             },
         }
 
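With bbq_hnsw index options in place, queries against the 256-dim field go through the standard Elasticsearch 8 kNN search API; a minimal sketch (the index name and client setup are assumptions, the commit only shows the mapping change):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://db-elasticsearch-v8.service.nyc1.consul:9208")
query_vector = [0.0] * 256  # placeholder: a projected, re-normalized story embedding
results = es.search(
    index="discover-stories",  # hypothetical index name
    knn={
        "field": "content_vector",
        "query_vector": query_vector,
        "k": 10,
        "num_candidates": 100,
    },
)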
@@ -947,7 +957,10 @@ class DiscoverStory:
             return []
         story_embedding = response.data[0].embedding
 
-        return story_embedding
+        # Project the embedding down to 256 dimensions
+        projected_embedding = project_vector(story_embedding)
+
+        return projected_embedding.tolist()
 
     @classmethod
     def debug_index(cls, show_data=True, show_source=False):
apps/search/projection_matrix.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+import os
+
+import numpy as np
+
+PROJECTION_MATRIX_PATH = os.path.join(os.path.dirname(__file__), "random_projection_matrix.npy")
+INPUT_DIMS = 1536
+OUTPUT_DIMS = 256
+
+
+def generate_projection_matrix():
+    """Generate a random projection matrix for dimensionality reduction."""
+    # Use a fixed random seed for reproducibility
+    np.random.seed(42)
+
+    # Generate random matrix
+    projection = np.random.normal(0, 1 / np.sqrt(OUTPUT_DIMS), (OUTPUT_DIMS, INPUT_DIMS))
+
+    # Normalize the matrix
+    projection = projection / np.linalg.norm(projection, axis=1)[:, np.newaxis]
+
+    return projection
+
+
+def get_projection_matrix():
+    """Get the projection matrix, generating it if it doesn't exist."""
+    if not os.path.exists(PROJECTION_MATRIX_PATH):
+        projection = generate_projection_matrix()
+        np.save(PROJECTION_MATRIX_PATH, projection)
+    return np.load(PROJECTION_MATRIX_PATH)
+
+
+def project_vector(vector):
+    """Project a vector from 1536 dimensions to 256 dimensions."""
+    projection = get_projection_matrix()
+    projected = np.dot(projection, vector)
+    # Normalize the projected vector
+    return projected / np.linalg.norm(projected)
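This is a classic random projection in the Johnson-Lindenstrauss style: the fixed seed makes the matrix reproducible across processes, row normalization keeps each output coordinate comparably scaled, and re-normalizing the result keeps dot-product/cosine scoring meaningful. A small usage sketch (the input is a stand-in for a real OpenAI embedding):

import numpy as np

from apps.search.projection_matrix import project_vector

fake_embedding = np.random.rand(1536).tolist()  # stand-in for response.data[0].embedding
vec = project_vector(fake_embedding)
assert vec.shape == (256,)
assert np.isclose(np.linalg.norm(vec), 1.0)  # output comes back unit-normalized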
apps/search/random_projection_matrix.npy (new binary file, not shown)
@@ -129,10 +129,11 @@ SESSION_REDIS_DB = 5
 
 ELASTICSEARCH_FEED_HOSTS = ["db_elasticsearch:9200"]
 ELASTICSEARCH_STORY_HOSTS = ["db_elasticsearch:9200"]
+ELASTICSEARCH_DISCOVER_HOSTS = ["db_elasticsearch:9200"]
 
 ELASTICSEARCH_FEED_HOST = "http://db_elasticsearch:9200"
 ELASTICSEARCH_STORY_HOST = "http://db_elasticsearch:9200"
+ELASTICSEARCH_DISCOVER_HOST = "http://db_elasticsearch:9200"
 BACKED_BY_AWS = {
     "pages_on_node": False,
     "pages_on_s3": False,
@@ -487,6 +487,7 @@ CELERY_BEAT_SCHEDULE = {
 # =========
 # = Mongo =
 # =========
+
 if DOCKERBUILD:
     MONGO_PORT = 29019
 else:
@@ -500,6 +501,14 @@ MONGO_ANALYTICS_DB = {
     "name": "nbanalytics",
 }
 
+# =================
+# = Elasticsearch =
+# =================
+
+ELASTICSEARCH_FEED_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
+ELASTICSEARCH_STORY_HOST = "http://db-elasticsearch.service.nyc1.consul:9200"
+ELASTICSEARCH_DISCOVER_HOST = "http://db-elasticsearch-v8.service.nyc1.consul:9208"
+
 # ====================
 # = Database Routers =
 # ====================