Moving to OpenAI Embeddings API.

This commit is contained in:
Samuel Clay 2024-10-10 21:56:49 -07:00
parent abeb3d45f6
commit 2931e5d578
7 changed files with 54 additions and 32 deletions

View file

@ -1070,7 +1070,7 @@ class Feed(models.Model):
self.similar_feeds.clear()
for result in results:
feed_id = result['_source']['feed_id']
feed_id = result["_source"]["feed_id"]
try:
self.similar_feeds.add(feed_id)
except IntegrityError:
@ -1084,11 +1084,11 @@ class Feed(models.Model):
combined_content_vector = SearchFeed.generate_combined_feed_content_vector(feed_ids)
results = SearchFeed.vector_query(combined_content_vector, feed_ids_to_exclude=feed_ids)
logging.debug(
f"Found {len(results)} recommendations for feeds {feed_ids}: {r['_source']['title'] for r in results}"
f"Found {len(results)} recommendations for feeds {feed_ids}: {[r['_source']['title'] for r in results]}"
)
return results
def _split_favicon_color(self, color=None):
if not color:
color = self.favicon_color

View file

@ -12,6 +12,7 @@ import redis
import urllib3
from django.conf import settings
from django.contrib.auth.models import User
from openai import OpenAI
from apps.search.tasks import (
FinishIndexSubscriptionsForSearch,
@ -20,6 +21,7 @@ from apps.search.tasks import (
IndexSubscriptionsForSearch,
)
from utils import log as logging
from utils.ai_functions import setup_openai_model
from utils.feed_functions import chunks
@ -491,7 +493,7 @@ class SearchStory:
class SearchFeed:
_es_client = None
name = "discover-feeds"
name = "discover-feeds-openai"
model = None
@classmethod
@ -578,7 +580,7 @@ class SearchFeed:
},
"content_vector": {
"type": "dense_vector",
"dims": 384, # Numbers of dims from all-MiniLM-L6-v2
"dims": 1536,  # Number of dims from text-embedding-3-small
},
}
cls.ES().indices.put_mapping(
@ -774,27 +776,11 @@ class SearchFeed:
def generate_feed_content_vector(cls, feed_id):
from apps.rss_feeds.models import Feed
if cls.model is None:
logging.debug(" ---> ~BG~FBLoading SentenceTransformer model")
start_time = time.time()
from sentence_transformers import SentenceTransformer
logging.debug(" ---> ~BG~FGDownloading SentenceTransformer model")
cls.model = SentenceTransformer("all-MiniLM-L6-v2")
logging.debug(
f" ---> ~FG~SNModel loaded, took ~SB{round(time.time() - start_time, 2)}~SN seconds"
)
feed = Feed.objects.get(id=feed_id)
# cross_encoder = CrossEncoder("BAAI/bge-large-zh-v2", device="cpu")
# cross_encoder.encode([feed.feed_title, feed.feed_content], convert_to_tensors="all")
stories = feed.get_stories()
stories_text = ""
for story in stories:
# stories_text += f"{story['story_title']} {story['story_authors']} {story['story_content']}"
stories_text += f"{story['story_title']} {' '.join([tag for tag in story['story_tags']])}"
text = f"{feed.feed_title} {feed.data.feed_tagline} {stories_text}"
@ -810,12 +796,24 @@ class SearchFeed:
# Remove extra whitespace
text = " ".join(text.split())
encoded_text = cls.model.encode(text)
normalized_embedding = encoded_text / np.linalg.norm(encoded_text)
# Send to OpenAI
model_name = "text-embedding-3-small"
encoding = setup_openai_model(model_name)
# logging.debug(f" ---> ~FGNormalized embedding for feed {feed_id}: {normalized_embedding}")
# Truncate the text to the maximum number of tokens
max_tokens = 8191 # Maximum for text-embedding-3-small
encoded_text = encoding.encode(text)
truncated_tokens = encoded_text[:max_tokens]
truncated_text = encoding.decode(truncated_tokens)
return normalized_embedding
client = OpenAI(api_key=settings.OPENAI_API_KEY)
response = client.embeddings.create(model=model_name, input=truncated_text)
embedding = response.data[0].embedding
# normalized_embedding = np.array(embedding) / np.linalg.norm(embedding)
return embedding
@classmethod
def export_csv(cls):

View file

@ -70,7 +70,7 @@ nose-exclude==0.5.0
numpy==1.26.4
oauth2==1.9.0.post1
oauthlib==3.1.0
openai~=0.27
openai~=1.51.2
paypalrestsdk==1.13.1
pbr==5.6.0
Pillow==8.0.1
@ -101,7 +101,6 @@ redis>=4,<5
requests>=2.25.0,<3
requests-oauthlib==1.3.0
scipy==1.12.0
sentence_transformers==3.0.1
sentry-sdk==1.44.1
sgmllib3k==1.0.0
simplejson==3.17.2

View file

@ -24,6 +24,13 @@ RUN set -ex \
' \
&& apt-get update \
&& apt-get install -y $rundDeps $buildDeps --no-install-recommends \
&& pip install -r requirements.txt \
&& pip install uv \
&& uv clean || true \
&& rm -rf /venv \
&& uv venv /venv \
&& PATH="/venv/bin:$PATH" \
&& VIRTUAL_ENV="/venv" \
&& rm -rf /root/.cache/uv \
&& uv pip install -r requirements.txt \
&& apt-get purge -y --auto-remove ${buildDeps} \
&& rm -rf /var/lib/apt/lists/*

View file

@ -25,8 +25,25 @@ RUN set -ex \
&& apt-get update \
&& apt-get install -y $rundDeps $buildDeps --no-install-recommends
COPY config/requirements.txt /srv/newsblur/
RUN pip install -U pip==24
RUN pip install --no-cache-dir -r requirements.txt
RUN pip cache purge
# Install Rust (required for tiktoken)
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Install uv
RUN pip install uv
# Clean uv cache and any virtual environment from previous builds
RUN uv clean || true && rm -rf /venv
# Create and activate virtual environment in /venv
RUN uv venv /venv
ENV PATH="/venv/bin:$PATH"
ENV VIRTUAL_ENV="/venv"
# Install dependencies
RUN rm -rf /root/.cache/uv && \
uv pip install -r requirements.txt
RUN apt-get purge -y --auto-remove ${buildDeps}
RUN rm -rf /var/lib/apt/lists/*

View file

@ -139,6 +139,7 @@ BACKED_BY_AWS = {
"icons_on_s3": False,
}
OPENAI_API_KEY = "sk-svcacct-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# ===========
# = Logging =

View file

@ -4,7 +4,7 @@
"description": "Servers used in running NewsBlur",
"main": "favicons.js",
"dependencies": {
"@postlight/mercury-parser": "^2.2.3",
"@postlight/parser": "^2.2.3",
"@sentry/browser": "^6.12.0",
"@sentry/node": "^6.12.0",
"@sentry/tracing": "^6.12.0",