mirror of https://github.com/samuelclay/NewsBlur.git (synced 2025-09-18 21:50:56 +00:00)

commit 2931e5d578 (parent abeb3d45f6)
Moving to OpenAI Embeddings API.

7 changed files with 54 additions and 32 deletions
@@ -1070,7 +1070,7 @@ class Feed(models.Model):
         self.similar_feeds.clear()
         for result in results:
-            feed_id = result['_source']['feed_id']
+            feed_id = result["_source"]["feed_id"]
             try:
                 self.similar_feeds.add(feed_id)
             except IntegrityError:
@@ -1084,11 +1084,11 @@ class Feed(models.Model):
         combined_content_vector = SearchFeed.generate_combined_feed_content_vector(feed_ids)
         results = SearchFeed.vector_query(combined_content_vector, feed_ids_to_exclude=feed_ids)
         logging.debug(
-            f"Found {len(results)} recommendations for feeds {feed_ids}: {r['_source']['title'] for r in results}"
+            f"Found {len(results)} recommendations for feeds {feed_ids}: {[r['_source']['title'] for r in results]}"
         )

         return results

     def _split_favicon_color(self, color=None):
         if not color:
             color = self.favicon_color
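
SearchFeed.vector_query is called above but not modified in this commit, so its body does not appear in these hunks. As a rough sketch of what such a lookup could look like against the dense_vector mapping further down, assuming the elasticsearch-py 8.x kNN search API (the index and field names come from this commit; the connection setup and the k/num_candidates values are illustrative):

    # Hypothetical sketch of a kNN feed lookup, not the repo's actual code.
    from elasticsearch import Elasticsearch

    def vector_query(query_vector, feed_ids_to_exclude=None, max_results=5):
        es = Elasticsearch("http://localhost:9200")  # assumed connection
        results = es.search(
            index="discover-feeds-openai",
            knn={
                "field": "content_vector",
                "query_vector": query_vector,
                "k": max_results,
                "num_candidates": 100,
                # Keep the source feeds themselves out of the recommendations
                "filter": {
                    "bool": {"must_not": [{"terms": {"feed_id": feed_ids_to_exclude or []}}]}
                },
            },
        )
        return results["hits"]["hits"]
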
@@ -12,6 +12,7 @@ import redis
 import urllib3
 from django.conf import settings
 from django.contrib.auth.models import User
+from openai import OpenAI

 from apps.search.tasks import (
     FinishIndexSubscriptionsForSearch,
@@ -20,6 +21,7 @@ from apps.search.tasks import (
     IndexSubscriptionsForSearch,
 )
 from utils import log as logging
+from utils.ai_functions import setup_openai_model
 from utils.feed_functions import chunks


@@ -491,7 +493,7 @@ class SearchStory:

 class SearchFeed:
     _es_client = None
-    name = "discover-feeds"
+    name = "discover-feeds-openai"
     model = None

     @classmethod
@@ -578,7 +580,7 @@ class SearchFeed:
             },
             "content_vector": {
                 "type": "dense_vector",
-                "dims": 384,  # Number of dims from all-MiniLM-L6-v2
+                "dims": 1536,  # Number of dims from text-embedding-3-small
             },
         }
         cls.ES().indices.put_mapping(
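
The dims value has to match the embedding model exactly: all-MiniLM-L6-v2 produces 384-dimensional vectors, while text-embedding-3-small returns 1536 by default. Elasticsearch also does not allow changing dims on an existing dense_vector field via put_mapping, which is presumably why the index name changes to "discover-feeds-openai" above: a fresh index receives the new mapping. A minimal sketch of creating such an index, assuming the elasticsearch-py 8.x client (the connection and the feed_id field type are assumptions):

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")  # assumed connection
    es.indices.create(
        index="discover-feeds-openai",
        mappings={
            "properties": {
                "feed_id": {"type": "integer"},  # assumed type
                "content_vector": {
                    "type": "dense_vector",
                    "dims": 1536,  # must equal len() of every vector indexed
                },
            }
        },
    )
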
@@ -774,27 +776,11 @@ class SearchFeed:
     def generate_feed_content_vector(cls, feed_id):
         from apps.rss_feeds.models import Feed

-        if cls.model is None:
-            logging.debug(" ---> ~BG~FBLoading SentenceTransformer model")
-            start_time = time.time()
-            from sentence_transformers import SentenceTransformer
-
-            logging.debug(" ---> ~BG~FGDownloading SentenceTransformer model")
-
-            cls.model = SentenceTransformer("all-MiniLM-L6-v2")
-            logging.debug(
-                f" ---> ~FG~SNModel loaded, took ~SB{round(time.time() - start_time, 2)}~SN seconds"
-            )
-
         feed = Feed.objects.get(id=feed_id)

-        # cross_encoder = CrossEncoder("BAAI/bge-large-zh-v2", device="cpu")
-        # cross_encoder.encode([feed.feed_title, feed.feed_content], convert_to_tensors="all")
-
         stories = feed.get_stories()
         stories_text = ""
         for story in stories:
-            # stories_text += f"{story['story_title']} {story['story_authors']} {story['story_content']}"
             stories_text += f"{story['story_title']} {' '.join([tag for tag in story['story_tags']])}"
         text = f"{feed.feed_title} {feed.data.feed_tagline} {stories_text}"
@@ -810,12 +796,24 @@ class SearchFeed:
         # Remove extra whitespace
         text = " ".join(text.split())

-        encoded_text = cls.model.encode(text)
-        normalized_embedding = encoded_text / np.linalg.norm(encoded_text)
-
-        # logging.debug(f" ---> ~FGNormalized embedding for feed {feed_id}: {normalized_embedding}")
-
-        return normalized_embedding
+        # Send to OpenAI
+        model_name = "text-embedding-3-small"
+        encoding = setup_openai_model(model_name)
+
+        # Truncate the text to the maximum number of tokens
+        max_tokens = 8191  # Maximum for text-embedding-3-small
+        encoded_text = encoding.encode(text)
+        truncated_tokens = encoded_text[:max_tokens]
+        truncated_text = encoding.decode(truncated_tokens)
+
+        client = OpenAI(api_key=settings.OPENAI_API_KEY)
+
+        response = client.embeddings.create(model=model_name, input=truncated_text)
+
+        embedding = response.data[0].embedding
+        # normalized_embedding = np.array(embedding) / np.linalg.norm(embedding)
+
+        return embedding

     @classmethod
     def export_csv(cls):
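
setup_openai_model is imported from utils.ai_functions, whose contents are not part of these hunks. From its usage above (encoding.encode and encoding.decode on raw text), it evidently returns a tiktoken tokenizer rather than an API client. A plausible sketch, assuming tiktoken:

    import tiktoken

    def setup_openai_model(model_name):
        # Return the tokenizer that matches the embedding model; fall back to
        # cl100k_base, which text-embedding-3-small uses, if tiktoken does
        # not recognize the model name.
        try:
            return tiktoken.encoding_for_model(model_name)
        except KeyError:
            return tiktoken.get_encoding("cl100k_base")

Token-level truncation matters here because the embeddings endpoint rejects inputs over the model's 8191-token limit; slicing characters instead of tokens would either over-trim or still overflow.
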
@@ -70,7 +70,7 @@ nose-exclude==0.5.0
 numpy==1.26.4
 oauth2==1.9.0.post1
 oauthlib==3.1.0
-openai~=0.27
+openai~=1.51.2
 paypalrestsdk==1.13.1
 pbr==5.6.0
 Pillow==8.0.1
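
The jump from openai~=0.27 to openai~=1.51.2 crosses the SDK's 1.0 rewrite, which removed the module-level API; that is why the model code above constructs an explicit OpenAI client. For comparison (key values are placeholders):

    # openai ~= 0.27 (module-level API, removed in 1.x):
    # import openai
    # openai.api_key = "sk-..."
    # response = openai.Embedding.create(model="text-embedding-3-small", input=text)
    # embedding = response["data"][0]["embedding"]

    # openai >= 1.0 (client object, as used in this commit):
    from openai import OpenAI

    client = OpenAI(api_key="sk-...")  # placeholder key
    response = client.embeddings.create(model="text-embedding-3-small", input="example text")
    embedding = response.data[0].embedding
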
@@ -101,7 +101,6 @@ redis>=4,<5
 requests>=2.25.0,<3
 requests-oauthlib==1.3.0
 scipy==1.12.0
-sentence_transformers==3.0.1
 sentry-sdk==1.44.1
 sgmllib3k==1.0.0
 simplejson==3.17.2
@@ -24,6 +24,13 @@ RUN set -ex \
     ' \
     && apt-get update \
     && apt-get install -y $rundDeps $buildDeps --no-install-recommends \
-    && pip install -r requirements.txt \
+    && pip install uv \
+    && uv clean || true \
+    && rm -rf /venv \
+    && uv venv /venv \
+    && PATH="/venv/bin:$PATH" \
+    && VIRTUAL_ENV="/venv" \
+    && rm -rf /root/.cache/uv \
+    && uv pip install -r requirements.txt \
     && apt-get purge -y --auto-remove ${buildDeps} \
     && rm -rf /var/lib/apt/lists/*
@@ -25,8 +25,25 @@ RUN set -ex \
     && apt-get update \
     && apt-get install -y $rundDeps $buildDeps --no-install-recommends
 COPY config/requirements.txt /srv/newsblur/
-RUN pip install -U pip==24
-RUN pip install --no-cache-dir -r requirements.txt
-RUN pip cache purge
+# Install Rust (required for tiktoken)
+RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install uv
+RUN pip install uv
+
+# Clean uv cache and any virtual environment from previous builds
+RUN uv clean || true && rm -rf /venv
+
+# Create and activate virtual environment in /venv
+RUN uv venv /venv
+ENV PATH="/venv/bin:$PATH"
+ENV VIRTUAL_ENV="/venv"
+
+# Install dependencies
+RUN rm -rf /root/.cache/uv && \
+    uv pip install -r requirements.txt
+
 RUN apt-get purge -y --auto-remove ${buildDeps}
 RUN rm -rf /var/lib/apt/lists/*
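
tiktoken ships a Rust extension, so on platforms without a prebuilt wheel the rustup toolchain installed above is needed for pip/uv to compile it from source. A quick sanity check that the build produced a working tokenizer (illustrative, not part of the image):

    import tiktoken

    # text-embedding-3-small uses the cl100k_base encoding
    enc = tiktoken.encoding_for_model("text-embedding-3-small")
    tokens = enc.encode("hello world")
    assert enc.decode(tokens) == "hello world"
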
@@ -139,6 +139,7 @@ BACKED_BY_AWS = {
     "icons_on_s3": False,
 }

+OPENAI_API_KEY = "sk-svcacct-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

 # ===========
 # = Logging =
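
The committed value is only a placeholder; real deployments would override it in local settings. If pulling the key from the environment is preferred, a common alternative (illustrative, not what this commit does):

    import os

    # Illustrative alternative to hardcoding a placeholder in settings files.
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
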
@@ -4,7 +4,7 @@
   "description": "Servers used in running NewsBlur",
   "main": "favicons.js",
   "dependencies": {
-    "@postlight/mercury-parser": "^2.2.3",
+    "@postlight/parser": "^2.2.3",
     "@sentry/browser": "^6.12.0",
     "@sentry/node": "^6.12.0",
     "@sentry/tracing": "^6.12.0",