mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Moving to OpenAI Embeddings API.
This commit is contained in:
parent
abeb3d45f6
commit
2931e5d578
7 changed files with 54 additions and 32 deletions
|
@ -1070,7 +1070,7 @@ class Feed(models.Model):
|
|||
|
||||
self.similar_feeds.clear()
|
||||
for result in results:
|
||||
feed_id = result['_source']['feed_id']
|
||||
feed_id = result["_source"]["feed_id"]
|
||||
try:
|
||||
self.similar_feeds.add(feed_id)
|
||||
except IntegrityError:
|
||||
|
@ -1084,11 +1084,11 @@ class Feed(models.Model):
|
|||
combined_content_vector = SearchFeed.generate_combined_feed_content_vector(feed_ids)
|
||||
results = SearchFeed.vector_query(combined_content_vector, feed_ids_to_exclude=feed_ids)
|
||||
logging.debug(
|
||||
f"Found {len(results)} recommendations for feeds {feed_ids}: {r['_source']['title'] for r in results}"
|
||||
f"Found {len(results)} recommendations for feeds {feed_ids}: {[r['_source']['title'] for r in results]}"
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _split_favicon_color(self, color=None):
|
||||
if not color:
|
||||
color = self.favicon_color
|
||||
|
|
|
@ -12,6 +12,7 @@ import redis
|
|||
import urllib3
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from openai import OpenAI
|
||||
|
||||
from apps.search.tasks import (
|
||||
FinishIndexSubscriptionsForSearch,
|
||||
|
@ -20,6 +21,7 @@ from apps.search.tasks import (
|
|||
IndexSubscriptionsForSearch,
|
||||
)
|
||||
from utils import log as logging
|
||||
from utils.ai_functions import setup_openai_model
|
||||
from utils.feed_functions import chunks
|
||||
|
||||
|
||||
|
@ -491,7 +493,7 @@ class SearchStory:
|
|||
|
||||
class SearchFeed:
|
||||
_es_client = None
|
||||
name = "discover-feeds"
|
||||
name = "discover-feeds-openai"
|
||||
model = None
|
||||
|
||||
@classmethod
|
||||
|
@ -578,7 +580,7 @@ class SearchFeed:
|
|||
},
|
||||
"content_vector": {
|
||||
"type": "dense_vector",
|
||||
"dims": 384, # Numbers of dims from all-MiniLM-L6-v2
|
||||
"dims": 1536, # Numbers of dims from text-embedding-3-small
|
||||
},
|
||||
}
|
||||
cls.ES().indices.put_mapping(
|
||||
|
@ -774,27 +776,11 @@ class SearchFeed:
|
|||
def generate_feed_content_vector(cls, feed_id):
|
||||
from apps.rss_feeds.models import Feed
|
||||
|
||||
if cls.model is None:
|
||||
logging.debug(" ---> ~BG~FBLoading SentenceTransformer model")
|
||||
start_time = time.time()
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
logging.debug(" ---> ~BG~FGDownloading SentenceTransformer model")
|
||||
|
||||
cls.model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
logging.debug(
|
||||
f" ---> ~FG~SNModel loaded, took ~SB{round(time.time() - start_time, 2)}~SN seconds"
|
||||
)
|
||||
|
||||
feed = Feed.objects.get(id=feed_id)
|
||||
|
||||
# cross_encoder = CrossEncoder("BAAI/bge-large-zh-v2", device="cpu")
|
||||
# cross_encoder.encode([feed.feed_title, feed.feed_content], convert_to_tensors="all")
|
||||
|
||||
stories = feed.get_stories()
|
||||
stories_text = ""
|
||||
for story in stories:
|
||||
# stories_text += f"{story['story_title']} {story['story_authors']} {story['story_content']}"
|
||||
stories_text += f"{story['story_title']} {' '.join([tag for tag in story['story_tags']])}"
|
||||
text = f"{feed.feed_title} {feed.data.feed_tagline} {stories_text}"
|
||||
|
||||
|
@ -810,12 +796,24 @@ class SearchFeed:
|
|||
# Remove extra whitespace
|
||||
text = " ".join(text.split())
|
||||
|
||||
encoded_text = cls.model.encode(text)
|
||||
normalized_embedding = encoded_text / np.linalg.norm(encoded_text)
|
||||
# Send to OpenAI
|
||||
model_name = "text-embedding-3-small"
|
||||
encoding = setup_openai_model(model_name)
|
||||
|
||||
# logging.debug(f" ---> ~FGNormalized embedding for feed {feed_id}: {normalized_embedding}")
|
||||
# Truncate the text to the maximum number of tokens
|
||||
max_tokens = 8191 # Maximum for text-embedding-3-small
|
||||
encoded_text = encoding.encode(text)
|
||||
truncated_tokens = encoded_text[:max_tokens]
|
||||
truncated_text = encoding.decode(truncated_tokens)
|
||||
|
||||
return normalized_embedding
|
||||
client = OpenAI(api_key=settings.OPENAI_API_KEY)
|
||||
|
||||
response = client.embeddings.create(model=model_name, input=truncated_text)
|
||||
|
||||
embedding = response.data[0].embedding
|
||||
# normalized_embedding = np.array(embedding) / np.linalg.norm(embedding)
|
||||
|
||||
return embedding
|
||||
|
||||
@classmethod
|
||||
def export_csv(cls):
|
||||
|
|
|
@ -70,7 +70,7 @@ nose-exclude==0.5.0
|
|||
numpy==1.26.4
|
||||
oauth2==1.9.0.post1
|
||||
oauthlib==3.1.0
|
||||
openai~=0.27
|
||||
openai~=1.51.2
|
||||
paypalrestsdk==1.13.1
|
||||
pbr==5.6.0
|
||||
Pillow==8.0.1
|
||||
|
@ -101,7 +101,6 @@ redis>=4,<5
|
|||
requests>=2.25.0,<3
|
||||
requests-oauthlib==1.3.0
|
||||
scipy==1.12.0
|
||||
sentence_transformers==3.0.1
|
||||
sentry-sdk==1.44.1
|
||||
sgmllib3k==1.0.0
|
||||
simplejson==3.17.2
|
||||
|
|
|
@ -24,6 +24,13 @@ RUN set -ex \
|
|||
' \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y $rundDeps $buildDeps --no-install-recommends \
|
||||
&& pip install -r requirements.txt \
|
||||
&& pip install uv \
|
||||
&& uv clean || true \
|
||||
&& rm -rf /venv \
|
||||
&& uv venv /venv \
|
||||
&& PATH="/venv/bin:$PATH" \
|
||||
&& VIRTUAL_ENV="/venv" \
|
||||
&& rm -rf /root/.cache/uv \
|
||||
&& uv pip install -r requirements.txt \
|
||||
&& apt-get purge -y --auto-remove ${buildDeps} \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
|
|
@ -25,8 +25,25 @@ RUN set -ex \
|
|||
&& apt-get update \
|
||||
&& apt-get install -y $rundDeps $buildDeps --no-install-recommends
|
||||
COPY config/requirements.txt /srv/newsblur/
|
||||
RUN pip install -U pip==24
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip cache purge
|
||||
|
||||
# Install Rust (required for tiktoken)
|
||||
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
# Install uv
|
||||
RUN pip install uv
|
||||
|
||||
# Clean uv cache and any virtual environment from previous builds
|
||||
RUN uv clean || true && rm -rf /venv
|
||||
|
||||
# Create and activate virtual environment in /venv
|
||||
RUN uv venv /venv
|
||||
ENV PATH="/venv/bin:$PATH"
|
||||
ENV VIRTUAL_ENV="/venv"
|
||||
|
||||
# Install dependencies
|
||||
RUN rm -rf /root/.cache/uv && \
|
||||
uv pip install -r requirements.txt
|
||||
|
||||
RUN apt-get purge -y --auto-remove ${buildDeps}
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
|
|
@ -139,6 +139,7 @@ BACKED_BY_AWS = {
|
|||
"icons_on_s3": False,
|
||||
}
|
||||
|
||||
OPENAI_API_KEY = "sk-svcacct-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
|
||||
|
||||
# ===========
|
||||
# = Logging =
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
"description": "Servers used in running NewsBlur",
|
||||
"main": "favicons.js",
|
||||
"dependencies": {
|
||||
"@postlight/mercury-parser": "^2.2.3",
|
||||
"@postlight/parser": "^2.2.3",
|
||||
"@sentry/browser": "^6.12.0",
|
||||
"@sentry/node": "^6.12.0",
|
||||
"@sentry/tracing": "^6.12.0",
|
||||
|
|
Loading…
Add table
Reference in a new issue