Running scikit-surprise

Samuel Clay 2023-10-11 09:06:44 -04:00
parent 663b5f979a
commit 2feaf087e7
7 changed files with 40 additions and 16 deletions

.vscode/settings.json

@ -1,7 +1,18 @@
{
"black-formatter.args": [
"--line-length 110"
],
"isort.args": [
"--profile",
"black"
],
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": true
},
"python.linting.enabled": true,
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": true,
"python.linting.flake8Enabled": false,
"python.linting.pylamaEnabled": false,
"python.linting.flake8Args": [
"--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401"


@ -69,6 +69,10 @@ jekyll:
cd blog && bundle exec jekyll serve
jekyll_drafts:
cd blog && bundle exec jekyll serve --drafts
lint:
docker exec -it newsblur_web isort --profile black .
docker exec -it newsblur_web black --line-length 110 .
docker exec -it newsblur_web flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=venv
# runs tests
test:
@ -198,7 +202,7 @@ mongorestore:
cp -fr docker/volumes/mongodump docker/volumes/db_mongo/
docker exec -it db_mongo mongorestore --port 29019 -d newsblur /data/db/mongodump/newsblur
pgrestore:
docker exec -it db_postgres bash -c "psql -U newsblur -c 'CREATE DATABASE newsblur_prod;'; pg_restore -U newsblur --role=newsblur --dbname=newsblur_prod /var/lib/postgresql/data/backup_postgresql_2023-10-09-04-00.sql.sql"
docker exec -it db_postgres bash -c "psql -U newsblur -c 'CREATE DATABASE newsblur_prod;'; pg_restore -U newsblur --role=newsblur --dbname=newsblur_prod /var/lib/postgresql/data/backup_postgresql_2023-10-10-04-00.sql.sql"
index_feeds:
docker exec -it newsblur_web ./manage.py index_feeds
index_stories:


@ -1,4 +1,6 @@
from django.conf import settings
from django.core.management.base import BaseCommand
from apps.recommendations.models import CollaborativelyFilteredRecommendation
@ -15,7 +17,7 @@ class Command(BaseCommand):
def handle(self, *args, **options):
# Store user feed data to file
file_name = "user_feed_data.csv"
file_name = f"{settings.SURPRISE_DATA_FOLDER}/user_feed_data_2.csv"
CollaborativelyFilteredRecommendation.store_user_feed_data_to_file(file_name)
# Load data and get the trained model
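The load-and-train step that the comment above refers to falls outside this hunk. A minimal sketch of what it could look like with scikit-surprise, assuming the three-column user,feed,rating CSV written by store_user_feed_data_to_file (the rating scale below is an assumption; the commit does not specify one):

    from surprise import SVD, Dataset, Reader
    from surprise.model_selection import train_test_split

    # The file has no header row and one implicit "rating" of 1 per subscription.
    reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 1))
    data = Dataset.load_from_file(file_name, reader=reader)
    trainset, testset = train_test_split(data, test_size=0.2)

    algo = SVD()
    algo.fit(trainset)
    # algo.predict(str(user_id), str(feed_id)).est estimates a user's affinity for a feed

SVD, Dataset, Reader, and train_test_split are exactly the names the reordered imports below pull in, so the sketch stays within what this commit already depends on.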


@ -1,15 +1,17 @@
import tempfile
import mongoengine as mongo
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import Reader, Dataset
from django.db import models
from django.contrib.auth.models import User
from apps.rss_feeds.models import Feed
from apps.reader.models import UserSubscription, UserSubscriptionFolders
from utils import json_functions as json
from collections import defaultdict
import mongoengine as mongo
from django.contrib.auth.models import User
from django.core.paginator import Paginator
from django.db import models
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from apps.reader.models import UserSubscription, UserSubscriptionFolders
from apps.rss_feeds.models import Feed
from utils import json_functions as json
class RecommendedFeed(models.Model):
feed = models.ForeignKey(Feed, related_name="recommendations", on_delete=models.CASCADE)
@ -80,15 +82,16 @@ class MFeedFolder(mongo.Document):
class CollaborativelyFilteredRecommendation(models.Model):
@classmethod
def store_user_feed_data_to_file(cls, file_name="user_feed_data.csv"):
def store_user_feed_data_to_file(cls, file_name):
temp_file = open(file_name, "w+")
users = User.objects.all()
users = User.objects.all().order_by("pk")
paginator = Paginator(users, 1000)
for page_num in paginator.page_range:
users = paginator.page(page_num)
for user in users:
# Only include feeds with num_subscribers >= 5
subs = UserSubscription.objects.filter(user=user, feed__num_subscribers__gte=5)
# print(f"User {user} has {subs.count()} feeds")
for sub in subs:
temp_file.write(f"{user.id},{sub.feed_id},1\n")
print(f"Page {page_num} of {paginator.num_pages} saved to {file_name}")


@ -103,7 +103,7 @@ pyzmq==22.0.3
raven==6.10.0
# readability-lxml==0.8.1.1 # Was vendorized due to noscript # Vendorized again due to 0.8.1.1 not out yet
redis>=4,<5
requests==2.25.0
requests>=2.25.0,<3
requests-oauthlib==1.3.0
scipy==1.5.4
sentry-sdk>=1,<2
@ -116,7 +116,7 @@ sqlparse==0.4.1
stevedore==3.3.0
stripe==2.55.1
subdomains==3.0.1
surprise==1.1.3
scikit-surprise~=1.1.3
text-unidecode==1.3
tiktoken~=0.4.0
toml==0.10.2
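The pin change swaps the distribution name only: scikit-surprise is the package published on PyPI, while the importable module remains surprise, so the imports in the model file above are unaffected. A quick post-install check:

    import surprise
    print(surprise.__version__)  # expected to report a 1.1.x release under the ~=1.1.3 pin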


@ -11,6 +11,7 @@ services:
user: "${CURRENT_UID}:${CURRENT_GID}"
environment:
- DOCKERBUILD=True
- SURPRISE_DATA_FOLDER=/srv/newsblur/docker/volumes/surprise
- RUNWITHMAKEBUILD=${RUNWITHMAKEBUILD?Use the `make` command instead of docker CLI}
stdin_open: true
tty: true
@ -154,6 +155,7 @@ services:
- ${PWD}:/srv/newsblur
environment:
- DOCKERBUILD=True
- SURPRISE_DATA_FOLDER=/srv/newsblur/docker/volumes/surprise
haproxy:
container_name: haproxy


@ -811,6 +811,8 @@ REDIS_PUBSUB_POOL = redis.ConnectionPool(host=REDIS_PUBSUB['host'], por
# celeryapp.autodiscover_tasks(INSTALLED_APPS)
accept_content = ['pickle', 'json', 'msgpack', 'yaml']
SURPRISE_DATA_FOLDER = os.getenv("SURPRISE_DATA_FOLDER", "/srv/newsblur/docker/volumes/surprise")
# ==========
# = Assets =
# ==========
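SURPRISE_DATA_FOLDER falls back to the in-repo Docker volume path when the docker-compose environment variable is not set. Code that needs the training data builds paths from the setting the same way the management command change above does, for example:

    from django.conf import settings

    # Mirrors the management command change; the filename itself is whatever the
    # caller chooses (user_feed_data_2.csv in this commit).
    file_name = f"{settings.SURPRISE_DATA_FOLDER}/user_feed_data_2.csv"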