Update the YouTube fetcher to use channels/playlists/users for everything, no longer relying on the RSS/XML URL.

This commit is contained in:
Samuel Clay 2024-03-03 12:59:12 -05:00
parent ec44039cc9
commit a5222d9675
4 changed files with 242 additions and 198 deletions

34
.vscode/settings.json vendored
View file

@ -1,23 +1,16 @@
{
"black-formatter.args": [
"--line-length 110"
],
"isort.args": [
"--profile",
"black"
],
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit"
},
"python.linting.enabled": true,
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": true,
"python.linting.pylamaEnabled": false,
"python.linting.flake8Args": [
"--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401"
],
"python.pythonPath": "~/.virtualenvs/newsblur3/bin/python",
// "python.linting.enabled": true,
// "python.linting.pylintEnabled": false,
// "python.linting.flake8Enabled": true,
// "python.linting.pylamaEnabled": false,
// "python.linting.flake8Args": [
// "--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401"
// ],
// "python.pythonPath": "~/.virtualenvs/newsblur/bin/python",
"editor.bracketPairColorization.enabled": true,
"editor.guides.bracketPairs": "active",
"git.ignoreLimitWarning": true,
@ -38,15 +31,12 @@
"docker/volumes": true,
"requirements.txt": true, // It's just a symlink to config/requirements.txt, which has git history
},
"python.formatting.blackArgs": [
"--line-length=110",
"--skip-string-normalization"
],
// "python.formatting.blackArgs": [
// "--line-length=110",
// "--skip-string-normalization"
// ],
"files.associations": {
"*.yml": "ansible"
},
"nrf-connect.toolchain.path": "${nrf-connect.toolchain:1.9.1}",
"C_Cpp.default.configurationProvider": "nrf-connect",
"editor.formatOnSave": false,
"ansible.python.interpreterPath": "/opt/homebrew/bin/python3",
}

View file

@ -8,11 +8,11 @@ billiard==3.6.4.0
bleach==3.2.1
boto3==1.18.12
botocore==1.21.12
black~=23.1.0
celery==4.4.7
certifi==2020.12.5
cffi==1.14.5
chardet==3.0.4
click==7.1.2
ConfigArgParse==1.4
cryptography==3.4.7
cssutils==1.0.2
@ -40,13 +40,10 @@ factory-boy==3.2.0
Faker==8.8.2
feedparser>=6,<7
filelock==3.0.12
Flask==1.1.2
Flask==3.0.2
Flask-BasicAuth==0.2.0
future==0.18.2
gevent==21.1.2
geventhttpclient==1.4.4
greenlet==1.1.0
gunicorn==20.1.0
gunicorn==21.2.0
h2==2.6.2
hiredis==1.1.0
hpack==3.0.0
@ -57,24 +54,21 @@ idna==2.10
image==1.5.33
iniconfig==1.1.1
isodate==0.6.0
itsdangerous==1.1.0
Jinja2==2.11.3
Jinja2==3.1.3
jmespath==0.10.0
jsonpickle==2.0.0
kombu==4.6.11
locust==1.4.3
lxml==4.6.2
MarkupSafe==1.1.1
lxml==5.1.0
mock==4.0.2
mongoengine==0.21.0
msgpack==1.0.2
ndg-httpsclient==0.5.1
nose==1.3.7
nose-exclude==0.5.0
numpy==1.19.4
numpy==1.26.4
oauth2==1.9.0.post1
oauthlib==3.1.0
packaging==20.9
paypalrestsdk==1.13.1
pbr==5.6.0
Pillow==8.0.1
@ -104,7 +98,7 @@ raven==6.10.0
redis>=4,<5
requests==2.25.0
requests-oauthlib==1.3.0
scipy==1.5.4
scipy==1.12.0
sentry-sdk>=1,<2
sgmllib3k==1.0.0
simplejson==3.17.2
@ -125,7 +119,6 @@ virtualenv==20.4.6
virtualenv-clone==0.5.4
virtualenvwrapper==4.8.4
webencodings==0.5.1
Werkzeug==1.0.1
XlsxWriter==1.3.7
zope.event==4.5.0
zope.interface==5.4.0

View file

@ -19,9 +19,7 @@ import random
import re
import xml.sax
import dateutil.parser
import feedparser
import isodate
import pymongo
import redis
import requests
@ -58,6 +56,7 @@ from utils.feed_functions import TimeoutError, timelimit
from utils.json_fetcher import JSONFetcher
from utils.story_functions import linkify, pre_process_story, strip_tags
from utils.twitter_fetcher import TwitterFetcher
from utils.youtube_fetcher import YoutubeFetcher
# from utils.feed_functions import mail_feed_error_to_admin
@ -131,10 +130,7 @@ class FetchFeed:
return FEED_OK, self.fpf
if 'youtube.com' in address:
try:
youtube_feed = self.fetch_youtube(address)
except (requests.adapters.ConnectionError):
youtube_feed = None
youtube_feed = self.fetch_youtube()
if not youtube_feed:
logging.debug(
' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address)
@ -313,162 +309,9 @@ class FetchFeed:
json_fetcher = JSONFetcher(self.feed, self.options)
return json_fetcher.fetch(address, headers)
def fetch_youtube(self, address):
username = None
channel_id = None
list_id = None
if 'gdata.youtube.com' in address:
try:
username_groups = re.search(r'gdata.youtube.com/feeds/\w+/users/(\w+)/', address)
if not username_groups:
return
username = username_groups.group(1)
except IndexError:
return
elif 'youtube.com/@' in address:
try:
username = address.split('youtube.com/@')[1]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?user=' in address:
try:
username = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['user'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
try:
channel_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['channel_id'][0]
except (IndexError, KeyError):
return
elif 'youtube.com/playlist' in address:
try:
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['list'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
try:
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['playlist_id'][0]
except IndexError:
return
if channel_id:
video_ids_xml = requests.get(
"https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id
)
channel_json = requests.get(
"https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s"
% (channel_id, settings.YOUTUBE_API_KEY)
)
channel = json.decode(channel_json.content)
try:
username = channel['items'][0]['snippet']['title']
description = channel['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
elif list_id:
playlist_json = requests.get(
"https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s"
% (list_id, settings.YOUTUBE_API_KEY)
)
playlist = json.decode(playlist_json.content)
try:
username = playlist['items'][0]['snippet']['title']
description = playlist['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
elif username:
video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username)
description = "YouTube videos uploaded by %s" % username
else:
return
if list_id:
playlist_json = requests.get(
"https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s"
% (list_id, settings.YOUTUBE_API_KEY)
)
playlist = json.decode(playlist_json.content)
try:
video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']]
except (IndexError, KeyError):
return
else:
if video_ids_xml.status_code != 200:
return
video_ids_soup = BeautifulSoup(video_ids_xml.content, features="lxml")
channel_url = video_ids_soup.find('author').find('uri').getText()
video_ids = []
for video_id in video_ids_soup.findAll('yt:videoid'):
video_ids.append(video_id.getText())
videos_json = requests.get(
"https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s"
% (','.join(video_ids), settings.YOUTUBE_API_KEY)
)
videos = json.decode(videos_json.content)
if 'error' in videos:
logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
return
data = {}
data['title'] = "%s's YouTube Videos" % username if 'Uploads' not in username else username
data['link'] = channel_url
data['description'] = description
data['lastBuildDate'] = datetime.datetime.utcnow()
data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL
data['docs'] = None
data['feed_url'] = address
rss = feedgenerator.Atom1Feed(**data)
for video in videos['items']:
thumbnail = video['snippet']['thumbnails'].get('maxres')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('high')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('medium')
duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds
duration_min, seconds = divmod(duration_sec, 60)
hours, minutes = divmod(duration_min, 60)
if hours >= 1:
duration = "%s:%s:%s" % (
hours,
'{0:02d}'.format(minutes),
'{0:02d}'.format(seconds),
)
else:
duration = "%s:%s" % (minutes, '{0:02d}'.format(seconds))
content = """<div class="NB-youtube-player">
<iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe>
</div>
<div class="NB-youtube-stats"><small>
<b>From:</b> <a href="%s">%s</a><br />
<b>Duration:</b> %s<br />
</small></div><hr>
<div class="NB-youtube-description">%s</div>
<img src="%s" style="display:none" />""" % (
("https://www.youtube.com/embed/" + video['id']),
channel_url,
username,
duration,
linkify(linebreaks(video['snippet']['description'])),
thumbnail['url'] if thumbnail else "",
)
link = "http://www.youtube.com/watch?v=%s" % video['id']
story_data = {
'title': video['snippet']['title'],
'link': link,
'description': content,
'author_name': username,
'categories': [],
'unique_id': "tag:youtube.com,2008:video:%s" % video['id'],
'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']),
}
rss.add_item(**story_data)
return rss.writeString('utf-8')
def fetch_youtube(self):
    """Delegate YouTube feed handling to the standalone YoutubeFetcher helper."""
    return YoutubeFetcher(self.feed, self.options).fetch()
class ProcessFeed:

218
utils/youtube_fetcher.py Normal file
View file

@ -0,0 +1,218 @@
import datetime
from utils import json_functions as json
import isodate
import re
import urllib.error
import urllib.parse
import urllib.request
import dateutil.parser
import requests
from django.conf import settings
from django.utils import feedgenerator
from django.utils.html import linebreaks
from apps.reader.models import UserSubscription
from apps.social.models import MSocialServices
from utils.story_functions import linkify
from utils import log as logging
class YoutubeFetcher:
    """Fetch a YouTube channel, playlist, or user through the Data API v3 and
    render the videos as an Atom feed document.

    The feed address is resolved to a channel id, playlist id, or
    username/handle; the video list is pulled through the API, and the result
    is returned as an Atom XML string that the fetch pipeline parses like any
    other feed. Returns None from fetch() when resolution or the API fails.
    """

    def __init__(self, feed, options=None):
        # `feed` is the project's Feed model; only `feed_address` is read here.
        self.feed = feed
        self.options = options or {}
        self.address = self.feed.feed_address

    def fetch(self):
        """Build and return the Atom XML string, or None on failure."""
        username = self.extract_username(self.address)
        channel_id = self.extract_channel_id(self.address)
        list_id = self.extract_list_id(self.address)
        video_ids = title = description = channel_url = None

        if channel_id:
            video_ids, title, description = self.fetch_channel_videos(channel_id)
            channel_url = "https://www.youtube.com/channel/%s" % channel_id
        elif list_id:
            video_ids, title, description = self.fetch_playlist_videos(list_id)
            channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
        elif username:
            video_ids, title, description = self.fetch_user_videos(username)
            channel_url = "https://www.youtube.com/user/%s" % username
        if not video_ids:
            return

        videos = self.fetch_videos(video_ids)
        if not videos:
            # fetch_videos returns None (and logs) on an API error; the
            # original fell through to videos["items"] and raised TypeError.
            return

        data = {}
        if username:
            data["title"] = f"{username}'s YouTube Videos"
        else:
            data["title"] = title
        data["link"] = channel_url
        data["description"] = description
        data["lastBuildDate"] = datetime.datetime.utcnow()
        data["generator"] = "NewsBlur YouTube API v3 Decrapifier - %s" % settings.NEWSBLUR_URL
        data["docs"] = None
        data["feed_url"] = self.address
        rss = feedgenerator.Atom1Feed(**data)

        for video in videos["items"]:
            # Prefer the largest thumbnail the API offers.
            thumbnail = video["snippet"]["thumbnails"].get("maxres")
            if not thumbnail:
                thumbnail = video["snippet"]["thumbnails"].get("high")
            if not thumbnail:
                thumbnail = video["snippet"]["thumbnails"].get("medium")
            # NOTE: .seconds drops whole days from the parsed duration;
            # >24h videos are effectively nonexistent, so this is kept as-is.
            duration_sec = isodate.parse_duration(video["contentDetails"]["duration"]).seconds
            duration_min, seconds = divmod(duration_sec, 60)
            hours, minutes = divmod(duration_min, 60)
            if hours >= 1:
                duration = "%s:%s:%s" % (
                    hours,
                    "{0:02d}".format(minutes),
                    "{0:02d}".format(seconds),
                )
            else:
                duration = "%s:%s" % (minutes, "{0:02d}".format(seconds))
            content = """<div class="NB-youtube-player">
<iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe>
</div>
<div class="NB-youtube-stats"><small>
<b>From:</b> <a href="%s">%s</a><br />
<b>Duration:</b> %s<br />
</small></div><hr>
<div class="NB-youtube-description">%s</div>
<img src="%s" style="display:none" />""" % (
                ("https://www.youtube.com/embed/" + video["id"]),
                channel_url,
                username,
                duration,
                linkify(linebreaks(video["snippet"]["description"])),
                thumbnail["url"] if thumbnail else "",
            )
            link = "http://www.youtube.com/watch?v=%s" % video["id"]
            story_data = {
                "title": video["snippet"]["title"],
                "link": link,
                "description": content,
                "author_name": username,
                "categories": [],
                "unique_id": "tag:youtube.com,2008:video:%s" % video["id"],
                "pubdate": dateutil.parser.parse(video["snippet"]["publishedAt"]),
            }
            rss.add_item(**story_data)
        return rss.writeString("utf-8")

    def extract_username(self, url):
        """Pull a username or @handle out of the supported YouTube URL forms.

        Returns None when the URL carries no user information.
        """
        if "gdata.youtube.com" in url:
            # Legacy gdata feed URLs: /feeds/<kind>/users/<name>/...
            username_groups = re.search(r"gdata\.youtube\.com/feeds/\w+/users/(\w+)/", url)
            if not username_groups:
                return
            return username_groups.group(1)
        elif "youtube.com/@" in url:
            # Trim any trailing path ("/videos") or query string off the handle;
            # the original kept it, producing handles like "name/videos".
            handle = url.split("youtube.com/@")[1]
            return re.split(r"[/?#]", handle)[0] or None
        elif "youtube.com/feeds/videos.xml?user=" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["user"][0]
            except (IndexError, KeyError):
                # parse_qs drops empty values, so a blank ?user= raises KeyError,
                # which the original (IndexError only) did not catch.
                return
        elif "youtube.com/user/" in url:
            username = re.findall(r"youtube\.com/user/([^/?#]+)", url)
            if username:
                return username[0]

    def extract_channel_id(self, url):
        """Return the channel id from a videos.xml?channel_id= address, or None."""
        if "youtube.com/feeds/videos.xml?channel_id=" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["channel_id"][0]
            except (IndexError, KeyError):
                return

    def extract_list_id(self, url):
        """Return the playlist id from a playlist or videos.xml address, or None."""
        if "youtube.com/playlist" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["list"][0]
            except (IndexError, KeyError):
                # KeyError when the list param is absent/empty (parse_qs drops
                # empty values); original caught IndexError only and crashed.
                return
        elif "youtube.com/feeds/videos.xml?playlist_id" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["playlist_id"][0]
            except (IndexError, KeyError):
                return

    def fetch_videos(self, video_ids):
        """Fetch contentDetails+snippet for the given video ids.

        Returns the decoded API response, or None (after logging) on an
        API-level error.
        """
        videos_json = requests.get(
            "https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s"
            % (",".join(video_ids), settings.YOUTUBE_API_KEY)
        )
        videos = json.decode(videos_json.content)
        if "error" in videos:
            logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
            return
        return videos

    def fetch_channel_videos(self, channel_id):
        """Resolve a channel to its uploads playlist and fetch its videos.

        Returns (video_ids, title, description); (None, None, None) when the
        channel cannot be resolved.
        """
        channel_json = requests.get(
            "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&id=%s&key=%s"
            % (channel_id, settings.YOUTUBE_API_KEY)
        )
        channel = json.decode(channel_json.content)
        try:
            title = channel["items"][0]["snippet"]["title"]
            description = channel["items"][0]["snippet"]["description"]
            uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        except (IndexError, KeyError):
            # A bare `return` here made fetch()'s 3-tuple unpack raise TypeError.
            return None, None, None
        return self.fetch_playlist_videos(uploads_list_id, title, description)

    def fetch_playlist_videos(self, list_id, title=None, description=None):
        """Fetch the video ids of a playlist.

        When title/description are not supplied they are looked up from the
        playlist itself. Returns (video_ids, title, description), or
        (None, None, None) on failure.
        """
        if not title and not description:
            playlist_json = requests.get(
                "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s"
                % (list_id, settings.YOUTUBE_API_KEY)
            )
            playlist = json.decode(playlist_json.content)
            try:
                title = playlist["items"][0]["snippet"]["title"]
                description = playlist["items"][0]["snippet"]["description"]
            except (IndexError, KeyError):
                return None, None, None
        playlist_json = requests.get(
            "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s"
            % (list_id, settings.YOUTUBE_API_KEY)
        )
        playlist = json.decode(playlist_json.content)
        try:
            video_ids = [video["snippet"]["resourceId"]["videoId"] for video in playlist["items"]]
        except (IndexError, KeyError):
            return None, None, None
        return video_ids, title, description

    def fetch_user_videos(self, username, username_key="forUsername"):
        """Resolve a legacy username (falling back to a handle) to its uploads
        playlist and fetch its videos.

        Returns (video_ids, title, description); (None, None, None) on failure.
        """
        channel_json = requests.get(
            "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&%s=%s&key=%s"
            % (username_key, username, settings.YOUTUBE_API_KEY)
        )
        channel = json.decode(channel_json.content)
        try:
            title = channel["items"][0]["snippet"]["title"]
            description = channel["items"][0]["snippet"]["description"]
            uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        except (IndexError, KeyError):
            # A handle (youtube.com/@name) does not resolve via forUsername --
            # the lookup raises here, so retry with forHandle before giving up.
            # The original only retried on a *successful* lookup with an empty
            # uploads id, and its bare `return` broke the caller's unpack.
            if username_key == "forUsername":
                return self.fetch_user_videos(username, username_key="forHandle")
            return None, None, None
        if not uploads_list_id and username_key == "forUsername":
            return self.fetch_user_videos(username, username_key="forHandle")
        return self.fetch_playlist_videos(uploads_list_id, title, description)