Moving to OpenAI Embeddings API.

This commit is contained in:
Samuel Clay 2024-10-10 21:56:49 -07:00
parent abeb3d45f6
commit 2931e5d578
7 changed files with 54 additions and 32 deletions

View file

@@ -1070,7 +1070,7 @@ class Feed(models.Model):
self.similar_feeds.clear() self.similar_feeds.clear()
for result in results: for result in results:
feed_id = result['_source']['feed_id'] feed_id = result["_source"]["feed_id"]
try: try:
self.similar_feeds.add(feed_id) self.similar_feeds.add(feed_id)
except IntegrityError: except IntegrityError:
@@ -1084,11 +1084,11 @@ class Feed(models.Model):
combined_content_vector = SearchFeed.generate_combined_feed_content_vector(feed_ids) combined_content_vector = SearchFeed.generate_combined_feed_content_vector(feed_ids)
results = SearchFeed.vector_query(combined_content_vector, feed_ids_to_exclude=feed_ids) results = SearchFeed.vector_query(combined_content_vector, feed_ids_to_exclude=feed_ids)
logging.debug( logging.debug(
f"Found {len(results)} recommendations for feeds {feed_ids}: {r['_source']['title'] for r in results}" f"Found {len(results)} recommendations for feeds {feed_ids}: {[r['_source']['title'] for r in results]}"
) )
return results return results
def _split_favicon_color(self, color=None): def _split_favicon_color(self, color=None):
if not color: if not color:
color = self.favicon_color color = self.favicon_color

View file

@@ -12,6 +12,7 @@ import redis
import urllib3 import urllib3
from django.conf import settings from django.conf import settings
from django.contrib.auth.models import User from django.contrib.auth.models import User
from openai import OpenAI
from apps.search.tasks import ( from apps.search.tasks import (
FinishIndexSubscriptionsForSearch, FinishIndexSubscriptionsForSearch,
@@ -20,6 +21,7 @@ from apps.search.tasks import (
IndexSubscriptionsForSearch, IndexSubscriptionsForSearch,
) )
from utils import log as logging from utils import log as logging
from utils.ai_functions import setup_openai_model
from utils.feed_functions import chunks from utils.feed_functions import chunks
@@ -491,7 +493,7 @@ class SearchStory:
class SearchFeed: class SearchFeed:
_es_client = None _es_client = None
name = "discover-feeds" name = "discover-feeds-openai"
model = None model = None
@classmethod @classmethod
@@ -578,7 +580,7 @@ class SearchFeed:
}, },
"content_vector": { "content_vector": {
"type": "dense_vector", "type": "dense_vector",
"dims": 384, # Numbers of dims from all-MiniLM-L6-v2 "dims": 1536, # Numbers of dims from text-embedding-3-small
}, },
} }
cls.ES().indices.put_mapping( cls.ES().indices.put_mapping(
@@ -774,27 +776,11 @@ class SearchFeed:
def generate_feed_content_vector(cls, feed_id): def generate_feed_content_vector(cls, feed_id):
from apps.rss_feeds.models import Feed from apps.rss_feeds.models import Feed
if cls.model is None:
logging.debug(" ---> ~BG~FBLoading SentenceTransformer model")
start_time = time.time()
from sentence_transformers import SentenceTransformer
logging.debug(" ---> ~BG~FGDownloading SentenceTransformer model")
cls.model = SentenceTransformer("all-MiniLM-L6-v2")
logging.debug(
f" ---> ~FG~SNModel loaded, took ~SB{round(time.time() - start_time, 2)}~SN seconds"
)
feed = Feed.objects.get(id=feed_id) feed = Feed.objects.get(id=feed_id)
# cross_encoder = CrossEncoder("BAAI/bge-large-zh-v2", device="cpu")
# cross_encoder.encode([feed.feed_title, feed.feed_content], convert_to_tensors="all")
stories = feed.get_stories() stories = feed.get_stories()
stories_text = "" stories_text = ""
for story in stories: for story in stories:
# stories_text += f"{story['story_title']} {story['story_authors']} {story['story_content']}"
stories_text += f"{story['story_title']} {' '.join([tag for tag in story['story_tags']])}" stories_text += f"{story['story_title']} {' '.join([tag for tag in story['story_tags']])}"
text = f"{feed.feed_title} {feed.data.feed_tagline} {stories_text}" text = f"{feed.feed_title} {feed.data.feed_tagline} {stories_text}"
@@ -810,12 +796,24 @@ class SearchFeed:
# Remove extra whitespace # Remove extra whitespace
text = " ".join(text.split()) text = " ".join(text.split())
encoded_text = cls.model.encode(text) # Send to OpenAI
normalized_embedding = encoded_text / np.linalg.norm(encoded_text) model_name = "text-embedding-3-small"
encoding = setup_openai_model(model_name)
# logging.debug(f" ---> ~FGNormalized embedding for feed {feed_id}: {normalized_embedding}") # Truncate the text to the maximum number of tokens
max_tokens = 8191 # Maximum for text-embedding-3-small
encoded_text = encoding.encode(text)
truncated_tokens = encoded_text[:max_tokens]
truncated_text = encoding.decode(truncated_tokens)
return normalized_embedding client = OpenAI(api_key=settings.OPENAI_API_KEY)
response = client.embeddings.create(model=model_name, input=truncated_text)
embedding = response.data[0].embedding
# normalized_embedding = np.array(embedding) / np.linalg.norm(embedding)
return embedding
@classmethod @classmethod
def export_csv(cls): def export_csv(cls):

View file

@@ -70,7 +70,7 @@ nose-exclude==0.5.0
numpy==1.26.4 numpy==1.26.4
oauth2==1.9.0.post1 oauth2==1.9.0.post1
oauthlib==3.1.0 oauthlib==3.1.0
openai~=0.27 openai~=1.51.2
paypalrestsdk==1.13.1 paypalrestsdk==1.13.1
pbr==5.6.0 pbr==5.6.0
Pillow==8.0.1 Pillow==8.0.1
@@ -101,7 +101,6 @@ redis>=4,<5
requests>=2.25.0,<3 requests>=2.25.0,<3
requests-oauthlib==1.3.0 requests-oauthlib==1.3.0
scipy==1.12.0 scipy==1.12.0
sentence_transformers==3.0.1
sentry-sdk==1.44.1 sentry-sdk==1.44.1
sgmllib3k==1.0.0 sgmllib3k==1.0.0
simplejson==3.17.2 simplejson==3.17.2

View file

@@ -24,6 +24,13 @@ RUN set -ex \
' \ ' \
&& apt-get update \ && apt-get update \
&& apt-get install -y $rundDeps $buildDeps --no-install-recommends \ && apt-get install -y $rundDeps $buildDeps --no-install-recommends \
&& pip install -r requirements.txt \ && pip install uv \
&& uv clean || true \
&& rm -rf /venv \
&& uv venv /venv \
&& PATH="/venv/bin:$PATH" \
&& VIRTUAL_ENV="/venv" \
&& rm -rf /root/.cache/uv \
&& uv pip install -r requirements.txt \
&& apt-get purge -y --auto-remove ${buildDeps} \ && apt-get purge -y --auto-remove ${buildDeps} \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*

View file

@@ -25,8 +25,25 @@ RUN set -ex \
&& apt-get update \ && apt-get update \
&& apt-get install -y $rundDeps $buildDeps --no-install-recommends && apt-get install -y $rundDeps $buildDeps --no-install-recommends
COPY config/requirements.txt /srv/newsblur/ COPY config/requirements.txt /srv/newsblur/
RUN pip install -U pip==24
RUN pip install --no-cache-dir -r requirements.txt # Install Rust (required for tiktoken)
RUN pip cache purge RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Install uv
RUN pip install uv
# Clean uv cache and any virtual environment from previous builds
RUN uv clean || true && rm -rf /venv
# Create and activate virtual environment in /venv
RUN uv venv /venv
ENV PATH="/venv/bin:$PATH"
ENV VIRTUAL_ENV="/venv"
# Install dependencies
RUN rm -rf /root/.cache/uv && \
uv pip install -r requirements.txt
RUN apt-get purge -y --auto-remove ${buildDeps} RUN apt-get purge -y --auto-remove ${buildDeps}
RUN rm -rf /var/lib/apt/lists/* RUN rm -rf /var/lib/apt/lists/*

View file

@@ -139,6 +139,7 @@ BACKED_BY_AWS = {
"icons_on_s3": False, "icons_on_s3": False,
} }
OPENAI_API_KEY = "sk-svcacct-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# =========== # ===========
# = Logging = # = Logging =

View file

@@ -4,7 +4,7 @@
"description": "Servers used in running NewsBlur", "description": "Servers used in running NewsBlur",
"main": "favicons.js", "main": "favicons.js",
"dependencies": { "dependencies": {
"@postlight/mercury-parser": "^2.2.3", "@postlight/parser": "^2.2.3",
"@sentry/browser": "^6.12.0", "@sentry/browser": "^6.12.0",
"@sentry/node": "^6.12.0", "@sentry/node": "^6.12.0",
"@sentry/tracing": "^6.12.0", "@sentry/tracing": "^6.12.0",