Updating youtube fetcher to use channels/playlists/users for everything, no longer relying on RSS/xml url.

This commit is contained in:
Samuel Clay 2024-03-03 12:59:12 -05:00
parent ec44039cc9
commit a5222d9675
4 changed files with 242 additions and 198 deletions

34
.vscode/settings.json vendored
View file

@ -1,23 +1,16 @@
{ {
"black-formatter.args": [
"--line-length 110"
],
"isort.args": [ "isort.args": [
"--profile", "--profile",
"black" "black"
], ],
"editor.formatOnSave": true, // "python.linting.enabled": true,
"editor.codeActionsOnSave": { // "python.linting.pylintEnabled": false,
"source.organizeImports": "explicit" // "python.linting.flake8Enabled": true,
}, // "python.linting.pylamaEnabled": false,
"python.linting.enabled": true, // "python.linting.flake8Args": [
"python.linting.pylintEnabled": false, // "--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401"
"python.linting.flake8Enabled": true, // ],
"python.linting.pylamaEnabled": false, // "python.pythonPath": "~/.virtualenvs/newsblur/bin/python",
"python.linting.flake8Args": [
"--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401"
],
"python.pythonPath": "~/.virtualenvs/newsblur3/bin/python",
"editor.bracketPairColorization.enabled": true, "editor.bracketPairColorization.enabled": true,
"editor.guides.bracketPairs": "active", "editor.guides.bracketPairs": "active",
"git.ignoreLimitWarning": true, "git.ignoreLimitWarning": true,
@ -38,15 +31,12 @@
"docker/volumes": true, "docker/volumes": true,
"requirements.txt": true, // It's just a symlink to config/requirements.txt, which has git history "requirements.txt": true, // It's just a symlink to config/requirements.txt, which has git history
}, },
"python.formatting.blackArgs": [ // "python.formatting.blackArgs": [
"--line-length=110", // "--line-length=110",
"--skip-string-normalization" // "--skip-string-normalization"
], // ],
"files.associations": { "files.associations": {
"*.yml": "ansible" "*.yml": "ansible"
}, },
"nrf-connect.toolchain.path": "${nrf-connect.toolchain:1.9.1}",
"C_Cpp.default.configurationProvider": "nrf-connect",
"editor.formatOnSave": false,
"ansible.python.interpreterPath": "/opt/homebrew/bin/python3", "ansible.python.interpreterPath": "/opt/homebrew/bin/python3",
} }

View file

@ -8,11 +8,11 @@ billiard==3.6.4.0
bleach==3.2.1 bleach==3.2.1
boto3==1.18.12 boto3==1.18.12
botocore==1.21.12 botocore==1.21.12
black~=23.1.0
celery==4.4.7 celery==4.4.7
certifi==2020.12.5 certifi==2020.12.5
cffi==1.14.5 cffi==1.14.5
chardet==3.0.4 chardet==3.0.4
click==7.1.2
ConfigArgParse==1.4 ConfigArgParse==1.4
cryptography==3.4.7 cryptography==3.4.7
cssutils==1.0.2 cssutils==1.0.2
@ -40,13 +40,10 @@ factory-boy==3.2.0
Faker==8.8.2 Faker==8.8.2
feedparser>=6,<7 feedparser>=6,<7
filelock==3.0.12 filelock==3.0.12
Flask==1.1.2 Flask==3.0.2
Flask-BasicAuth==0.2.0 Flask-BasicAuth==0.2.0
future==0.18.2 future==0.18.2
gevent==21.1.2 gunicorn==21.2.0
geventhttpclient==1.4.4
greenlet==1.1.0
gunicorn==20.1.0
h2==2.6.2 h2==2.6.2
hiredis==1.1.0 hiredis==1.1.0
hpack==3.0.0 hpack==3.0.0
@ -57,24 +54,21 @@ idna==2.10
image==1.5.33 image==1.5.33
iniconfig==1.1.1 iniconfig==1.1.1
isodate==0.6.0 isodate==0.6.0
itsdangerous==1.1.0 Jinja2==3.1.3
Jinja2==2.11.3
jmespath==0.10.0 jmespath==0.10.0
jsonpickle==2.0.0 jsonpickle==2.0.0
kombu==4.6.11 kombu==4.6.11
locust==1.4.3 locust==1.4.3
lxml==4.6.2 lxml==5.1.0
MarkupSafe==1.1.1
mock==4.0.2 mock==4.0.2
mongoengine==0.21.0 mongoengine==0.21.0
msgpack==1.0.2 msgpack==1.0.2
ndg-httpsclient==0.5.1 ndg-httpsclient==0.5.1
nose==1.3.7 nose==1.3.7
nose-exclude==0.5.0 nose-exclude==0.5.0
numpy==1.19.4 numpy==1.26.4
oauth2==1.9.0.post1 oauth2==1.9.0.post1
oauthlib==3.1.0 oauthlib==3.1.0
packaging==20.9
paypalrestsdk==1.13.1 paypalrestsdk==1.13.1
pbr==5.6.0 pbr==5.6.0
Pillow==8.0.1 Pillow==8.0.1
@ -104,7 +98,7 @@ raven==6.10.0
redis>=4,<5 redis>=4,<5
requests==2.25.0 requests==2.25.0
requests-oauthlib==1.3.0 requests-oauthlib==1.3.0
scipy==1.5.4 scipy==1.12.0
sentry-sdk>=1,<2 sentry-sdk>=1,<2
sgmllib3k==1.0.0 sgmllib3k==1.0.0
simplejson==3.17.2 simplejson==3.17.2
@ -125,7 +119,6 @@ virtualenv==20.4.6
virtualenv-clone==0.5.4 virtualenv-clone==0.5.4
virtualenvwrapper==4.8.4 virtualenvwrapper==4.8.4
webencodings==0.5.1 webencodings==0.5.1
Werkzeug==1.0.1
XlsxWriter==1.3.7 XlsxWriter==1.3.7
zope.event==4.5.0 zope.event==4.5.0
zope.interface==5.4.0 zope.interface==5.4.0

View file

@ -19,9 +19,7 @@ import random
import re import re
import xml.sax import xml.sax
import dateutil.parser
import feedparser import feedparser
import isodate
import pymongo import pymongo
import redis import redis
import requests import requests
@ -58,6 +56,7 @@ from utils.feed_functions import TimeoutError, timelimit
from utils.json_fetcher import JSONFetcher from utils.json_fetcher import JSONFetcher
from utils.story_functions import linkify, pre_process_story, strip_tags from utils.story_functions import linkify, pre_process_story, strip_tags
from utils.twitter_fetcher import TwitterFetcher from utils.twitter_fetcher import TwitterFetcher
from utils.youtube_fetcher import YoutubeFetcher
# from utils.feed_functions import mail_feed_error_to_admin # from utils.feed_functions import mail_feed_error_to_admin
@ -131,10 +130,7 @@ class FetchFeed:
return FEED_OK, self.fpf return FEED_OK, self.fpf
if 'youtube.com' in address: if 'youtube.com' in address:
try: youtube_feed = self.fetch_youtube()
youtube_feed = self.fetch_youtube(address)
except (requests.adapters.ConnectionError):
youtube_feed = None
if not youtube_feed: if not youtube_feed:
logging.debug( logging.debug(
' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address) ' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address)
@ -313,162 +309,9 @@ class FetchFeed:
json_fetcher = JSONFetcher(self.feed, self.options) json_fetcher = JSONFetcher(self.feed, self.options)
return json_fetcher.fetch(address, headers) return json_fetcher.fetch(address, headers)
def fetch_youtube(self, address): def fetch_youtube(self):
username = None youtube_fetcher = YoutubeFetcher(self.feed, self.options)
channel_id = None return youtube_fetcher.fetch()
list_id = None
if 'gdata.youtube.com' in address:
try:
username_groups = re.search(r'gdata.youtube.com/feeds/\w+/users/(\w+)/', address)
if not username_groups:
return
username = username_groups.group(1)
except IndexError:
return
elif 'youtube.com/@' in address:
try:
username = address.split('youtube.com/@')[1]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?user=' in address:
try:
username = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['user'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
try:
channel_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['channel_id'][0]
except (IndexError, KeyError):
return
elif 'youtube.com/playlist' in address:
try:
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['list'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
try:
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['playlist_id'][0]
except IndexError:
return
if channel_id:
video_ids_xml = requests.get(
"https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id
)
channel_json = requests.get(
"https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s"
% (channel_id, settings.YOUTUBE_API_KEY)
)
channel = json.decode(channel_json.content)
try:
username = channel['items'][0]['snippet']['title']
description = channel['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
elif list_id:
playlist_json = requests.get(
"https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s"
% (list_id, settings.YOUTUBE_API_KEY)
)
playlist = json.decode(playlist_json.content)
try:
username = playlist['items'][0]['snippet']['title']
description = playlist['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
elif username:
video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username)
description = "YouTube videos uploaded by %s" % username
else:
return
if list_id:
playlist_json = requests.get(
"https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s"
% (list_id, settings.YOUTUBE_API_KEY)
)
playlist = json.decode(playlist_json.content)
try:
video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']]
except (IndexError, KeyError):
return
else:
if video_ids_xml.status_code != 200:
return
video_ids_soup = BeautifulSoup(video_ids_xml.content, features="lxml")
channel_url = video_ids_soup.find('author').find('uri').getText()
video_ids = []
for video_id in video_ids_soup.findAll('yt:videoid'):
video_ids.append(video_id.getText())
videos_json = requests.get(
"https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s"
% (','.join(video_ids), settings.YOUTUBE_API_KEY)
)
videos = json.decode(videos_json.content)
if 'error' in videos:
logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
return
data = {}
data['title'] = "%s's YouTube Videos" % username if 'Uploads' not in username else username
data['link'] = channel_url
data['description'] = description
data['lastBuildDate'] = datetime.datetime.utcnow()
data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL
data['docs'] = None
data['feed_url'] = address
rss = feedgenerator.Atom1Feed(**data)
for video in videos['items']:
thumbnail = video['snippet']['thumbnails'].get('maxres')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('high')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('medium')
duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds
duration_min, seconds = divmod(duration_sec, 60)
hours, minutes = divmod(duration_min, 60)
if hours >= 1:
duration = "%s:%s:%s" % (
hours,
'{0:02d}'.format(minutes),
'{0:02d}'.format(seconds),
)
else:
duration = "%s:%s" % (minutes, '{0:02d}'.format(seconds))
content = """<div class="NB-youtube-player">
<iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe>
</div>
<div class="NB-youtube-stats"><small>
<b>From:</b> <a href="%s">%s</a><br />
<b>Duration:</b> %s<br />
</small></div><hr>
<div class="NB-youtube-description">%s</div>
<img src="%s" style="display:none" />""" % (
("https://www.youtube.com/embed/" + video['id']),
channel_url,
username,
duration,
linkify(linebreaks(video['snippet']['description'])),
thumbnail['url'] if thumbnail else "",
)
link = "http://www.youtube.com/watch?v=%s" % video['id']
story_data = {
'title': video['snippet']['title'],
'link': link,
'description': content,
'author_name': username,
'categories': [],
'unique_id': "tag:youtube.com,2008:video:%s" % video['id'],
'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']),
}
rss.add_item(**story_data)
return rss.writeString('utf-8')
class ProcessFeed: class ProcessFeed:

218
utils/youtube_fetcher.py Normal file
View file

@ -0,0 +1,218 @@
import datetime
from utils import json_functions as json
import isodate
import re
import urllib.error
import urllib.parse
import urllib.request
import dateutil.parser
import requests
from django.conf import settings
from django.utils import feedgenerator
from django.utils.html import linebreaks
from apps.reader.models import UserSubscription
from apps.social.models import MSocialServices
from utils.story_functions import linkify
from utils import log as logging
class YoutubeFetcher:
    """Fetch videos for a YouTube channel, playlist, or user via the
    YouTube Data API v3 and render them as an Atom feed string.

    Replaces the old RSS/videos.xml scraping: the feed address is parsed for a
    username/handle, channel id, or playlist id, the matching uploads are
    listed through the API, and the result is re-serialized with
    Django's feedgenerator.
    """

    def __init__(self, feed, options=None):
        # feed: the NewsBlur Feed model whose feed_address points at YouTube.
        self.feed = feed
        self.options = options or {}
        self.address = self.feed.feed_address

    def fetch(self):
        """Build and return the Atom feed as a UTF-8 string.

        Returns None when the address doesn't parse as a YouTube feed, an API
        lookup fails, or no video ids could be collected.
        """
        username = self.extract_username(self.address)
        channel_id = self.extract_channel_id(self.address)
        list_id = self.extract_list_id(self.address)
        # Initialize everything up front so a partial match can never leave a
        # name unbound below.
        video_ids = None
        title = None
        description = None
        channel_url = None

        if channel_id:
            video_ids, title, description = self.fetch_channel_videos(channel_id)
            channel_url = "https://www.youtube.com/channel/%s" % channel_id
        elif list_id:
            video_ids, title, description = self.fetch_playlist_videos(list_id)
            channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
        elif username:
            video_ids, title, description = self.fetch_user_videos(username)
            channel_url = "https://www.youtube.com/user/%s" % username

        if not video_ids:
            return

        videos = self.fetch_videos(video_ids)
        if not videos:
            # API error (already logged in fetch_videos); bail out instead of
            # crashing on videos["items"] below.
            return

        data = {}
        if username:
            data["title"] = f"{username}'s YouTube Videos"
        else:
            data["title"] = title
        data["link"] = channel_url
        data["description"] = description
        data["lastBuildDate"] = datetime.datetime.utcnow()
        data["generator"] = "NewsBlur YouTube API v3 Decrapifier - %s" % settings.NEWSBLUR_URL
        data["docs"] = None
        data["feed_url"] = self.address
        rss = feedgenerator.Atom1Feed(**data)

        for video in videos["items"]:
            # Prefer the largest thumbnail the API offers.
            thumbnail = video["snippet"]["thumbnails"].get("maxres")
            if not thumbnail:
                thumbnail = video["snippet"]["thumbnails"].get("high")
            if not thumbnail:
                thumbnail = video["snippet"]["thumbnails"].get("medium")
            # ISO-8601 duration (e.g. "PT1H2M3S") -> "h:mm:ss" / "m:ss" display.
            duration_sec = isodate.parse_duration(video["contentDetails"]["duration"]).seconds
            duration_min, seconds = divmod(duration_sec, 60)
            hours, minutes = divmod(duration_min, 60)
            if hours >= 1:
                duration = "%s:%s:%s" % (
                    hours,
                    "{0:02d}".format(minutes),
                    "{0:02d}".format(seconds),
                )
            else:
                duration = "%s:%s" % (minutes, "{0:02d}".format(seconds))
            content = """<div class="NB-youtube-player">
                <iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe>
            </div>
            <div class="NB-youtube-stats"><small>
                <b>From:</b> <a href="%s">%s</a><br />
                <b>Duration:</b> %s<br />
            </small></div><hr>
            <div class="NB-youtube-description">%s</div>
            <img src="%s" style="display:none" />""" % (
                ("https://www.youtube.com/embed/" + video["id"]),
                channel_url,
                username,
                duration,
                linkify(linebreaks(video["snippet"]["description"])),
                thumbnail["url"] if thumbnail else "",
            )
            link = "http://www.youtube.com/watch?v=%s" % video["id"]
            story_data = {
                "title": video["snippet"]["title"],
                "link": link,
                "description": content,
                "author_name": username,
                "categories": [],
                "unique_id": "tag:youtube.com,2008:video:%s" % video["id"],
                "pubdate": dateutil.parser.parse(video["snippet"]["publishedAt"]),
            }
            rss.add_item(**story_data)
        return rss.writeString("utf-8")

    def extract_username(self, url):
        """Return the username or handle embedded in a YouTube url, or None.

        Handles legacy gdata urls, "@handle" urls, videos.xml?user= feeds, and
        /user/ channel pages.
        """
        if "gdata.youtube.com" in url:
            username_groups = re.search(r"gdata.youtube.com/feeds/\w+/users/(\w+)/", url)
            if not username_groups:
                return
            return username_groups.group(1)
        elif "youtube.com/@" in url:
            try:
                return url.split("youtube.com/@")[1]
            except IndexError:
                return
        elif "youtube.com/feeds/videos.xml?user=" in url:
            try:
                # KeyError added alongside IndexError for consistency with
                # extract_channel_id: parse_qs omits missing/empty params.
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["user"][0]
            except (IndexError, KeyError):
                return
        elif "youtube.com/user/" in url:
            username = re.findall(r"youtube.com/user/([^/]+)", url)
            if username:
                return username[0]

    def extract_channel_id(self, url):
        """Return the channel_id query param from a videos.xml url, or None."""
        if "youtube.com/feeds/videos.xml?channel_id=" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["channel_id"][0]
            except (IndexError, KeyError):
                return

    def extract_list_id(self, url):
        """Return the playlist id from a playlist page or videos.xml url, or None."""
        if "youtube.com/playlist" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["list"][0]
            except (IndexError, KeyError):
                return
        elif "youtube.com/feeds/videos.xml?playlist_id" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["playlist_id"][0]
            except (IndexError, KeyError):
                return

    def fetch_videos(self, video_ids):
        """Fetch snippet+contentDetails for the given video ids.

        Returns the decoded API payload, or None when the API reports an
        error (logged). "%%2C" is a literal url-encoded comma surviving the
        %-format below.
        """
        videos_json = requests.get(
            "https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s"
            % (",".join(video_ids), settings.YOUTUBE_API_KEY)
        )
        videos = json.decode(videos_json.content)
        if "error" in videos:
            logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
            return
        return videos

    def fetch_channel_videos(self, channel_id):
        """Resolve a channel to its uploads playlist and fetch its videos.

        Returns (video_ids, title, description); (None, None, None) on failure.
        """
        channel_json = requests.get(
            "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&id=%s&key=%s"
            % (channel_id, settings.YOUTUBE_API_KEY)
        )
        channel = json.decode(channel_json.content)
        try:
            title = channel["items"][0]["snippet"]["title"]
            description = channel["items"][0]["snippet"]["description"]
            uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        except (IndexError, KeyError):
            # Bug fix: callers unpack three values, so a bare return would
            # raise "cannot unpack non-sequence NoneType".
            return None, None, None
        return self.fetch_playlist_videos(uploads_list_id, title, description)

    def fetch_playlist_videos(self, list_id, title=None, description=None):
        """Fetch the video ids of a playlist.

        Looks up the playlist's own title/description unless both were passed
        in (the channel/user paths supply them). Returns
        (video_ids, title, description); (None, None, None) on failure.
        """
        if not title and not description:
            playlist_json = requests.get(
                "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s"
                % (list_id, settings.YOUTUBE_API_KEY)
            )
            playlist = json.decode(playlist_json.content)
            try:
                title = playlist["items"][0]["snippet"]["title"]
                description = playlist["items"][0]["snippet"]["description"]
            except (IndexError, KeyError):
                return None, None, None
        playlist_json = requests.get(
            "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s"
            % (list_id, settings.YOUTUBE_API_KEY)
        )
        playlist = json.decode(playlist_json.content)
        try:
            video_ids = [video["snippet"]["resourceId"]["videoId"] for video in playlist["items"]]
        except (IndexError, KeyError):
            return None, None, None
        return video_ids, title, description

    def fetch_user_videos(self, username, username_key="forUsername"):
        """Resolve a legacy username (or handle) to its uploads playlist.

        Tries the channels endpoint with forUsername first, falling back to
        forHandle. Returns (video_ids, title, description);
        (None, None, None) on failure.
        """
        channel_json = requests.get(
            "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&%s=%s&key=%s"
            % (username_key, username, settings.YOUTUBE_API_KEY)
        )
        channel = json.decode(channel_json.content)
        try:
            title = channel["items"][0]["snippet"]["title"]
            description = channel["items"][0]["snippet"]["description"]
            uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        except (IndexError, KeyError):
            if username_key == "forUsername":
                # Bug fix: a username that only exists as a handle returns no
                # items here, so the fallback must fire from the except path
                # too (previously it was unreachable in this case).
                return self.fetch_user_videos(username, username_key="forHandle")
            # Bug fix: callers unpack three values; see fetch_channel_videos.
            return None, None, None
        if not uploads_list_id and username_key == "forUsername":
            return self.fetch_user_videos(username, username_key="forHandle")
        return self.fetch_playlist_videos(uploads_list_id, title, description)