Mirror of https://github.com/samuelclay/NewsBlur.git, synced 2025-04-13 09:42:01 +00:00
Sending 403'd sites over to scrapeninja.
Parent: 16f1d4daf6
Commit: 6556d7c789
7 changed files with 98 additions and 12 deletions
apps/profile/migrations/0017_auto_20250305_0552.py (new file, +19)
File diff suppressed because one or more lines are too long
apps/rss_feeds/migrations/0011_auto_20250305_0552.py (new file, +23)

@@ -0,0 +1,23 @@
+# Generated by Django 3.1.10 on 2025-03-05 05:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('rss_feeds', '0010_feed_discover_indexed'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='feed',
+            name='date_forbidden',
+            field=models.DateTimeField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='feed',
+            name='is_forbidden',
+            field=models.BooleanField(blank=True, null=True),
+        ),
+    ]
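The rss_feeds migration is a pair of plain AddField operations (the profile migration's diff is suppressed above). Once migrations have run, the new columns can be sanity-checked from a Django shell; a minimal sketch, with the assertion ours rather than anything in the commit:

    from apps.rss_feeds.models import Feed

    # Both new columns should be present on the feeds table after migrating.
    field_names = {f.name for f in Feed._meta.get_fields()}
    assert {"is_forbidden", "date_forbidden"} <= field_names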
apps/rss_feeds/models.py

@@ -118,6 +118,8 @@ class Feed(models.Model):
     similar_feeds = models.ManyToManyField(
         "self", related_name="feeds_by_similarity", symmetrical=False, blank=True
     )
+    is_forbidden = models.BooleanField(blank=True, null=True)
+    date_forbidden = models.DateTimeField(blank=True, null=True)

     class Meta:
         db_table = "feeds"
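With both a flag and a timestamp on the model, flagged feeds are easy to audit. A hypothetical maintenance query, not part of the commit (the seven-day window is arbitrary):

    import datetime

    from apps.rss_feeds.models import Feed

    week_ago = datetime.datetime.now() - datetime.timedelta(days=7)
    recently_forbidden = Feed.objects.filter(is_forbidden=True, date_forbidden__gte=week_ago)
    print(recently_forbidden.count(), "feeds flagged as forbidden in the past week")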
@@ -24,17 +24,7 @@ RUN set -ex \
     ' \
     && apt-get update || (echo "Retrying apt-get update with different DNS" && echo "nameserver 8.8.8.8" > /etc/resolv.conf && apt-get update) \
     && apt-get install -y $rundDeps $buildDeps --no-install-recommends || (echo "Retrying apt-get install with different package names" && apt-get install -y libpq5 libjpeg62-turbo libxslt1.1 patch gfortran libblas-dev libffi-dev libjpeg-dev libpq-dev libev-dev libreadline-dev liblapack-dev libxml2-dev libxslt1-dev libncurses-dev zlib1g-dev --no-install-recommends) \
-    && apt-get install -y wget curl ca-certificates \
-    && ARCH=$(uname -m) \
-    && if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
-        CURL_IMPERSONATE_URL="https://github.com/lexiforest/curl-impersonate/releases/download/v0.9.3/curl-impersonate-v0.9.3.aarch64-linux-gnu.tar.gz"; \
-    else \
-        CURL_IMPERSONATE_URL="https://github.com/lexiforest/curl-impersonate/releases/download/v0.9.3/curl-impersonate-v0.9.3.x86_64-linux-gnu.tar.gz"; \
-    fi \
-    && wget $CURL_IMPERSONATE_URL \
-    && tar -xzf curl-impersonate-*.tar.gz -C /usr/local/bin/ \
-    && rm curl-impersonate-*.tar.gz \
-    && chmod +x /usr/local/bin/curl-impersonate-chrome
+    && apt-get install -y wget curl ca-certificates

 COPY config/requirements.txt /srv/newsblur/

 # Install Rust (required for tiktoken)
@@ -11998,7 +11998,7 @@ form.opml_import_form input {
 }

 .NB-modal-exception .NB-modal-submit-button {
-    float: left;
+    display: inline-block;
 }

 .NB-modal-exception .NB-exception-submit-wrapper {
@@ -547,6 +547,7 @@ FACEBOOK_NAMESPACE = "newsblur"
 TWITTER_CONSUMER_KEY = "ooooooooooooooooooooo"
 TWITTER_CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
 YOUTUBE_API_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
+SCRAPENINJA_API_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

 # ===============
 # = AWS Backing =
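The committed value is a placeholder, like the other credentials in this file. A deployment would override it; a hypothetical sketch that reads the key from the environment (the environment-variable convention is our assumption, not NewsBlur's):

    import os

    # Hypothetical override for a local settings file; only the setting name
    # comes from the commit.
    SCRAPENINJA_API_KEY = os.environ.get("SCRAPENINJA_API_KEY", "")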
utils/feed_fetcher.py

@@ -169,6 +169,14 @@ class FetchFeed:
                 )
                 return FEED_ERRHTTP, None
             self.fpf = feedparser.parse(facebook_feed)
+        elif self.feed.is_forbidden:
+            forbidden_feed = self.fetch_forbidden()
+            if not forbidden_feed:
+                logging.debug(
+                    " ***> [%-30s] ~FRForbidden feed fetch failed: %s" % (self.feed.log_title[:30], address)
+                )
+                return FEED_ERRHTTP, None
+            self.fpf = feedparser.parse(forbidden_feed)

         if not self.fpf and "json" in address:
             try:
@@ -334,6 +342,45 @@ class FetchFeed:
         youtube_fetcher = YoutubeFetcher(self.feed, self.options)
         return youtube_fetcher.fetch()

+    def fetch_forbidden(self, js_scrape=False):
+        url = "https://scrapeninja.p.rapidapi.com/scrape"
+        if js_scrape:
+            url = "https://scrapeninja.p.rapidapi.com/scrape-js"
+
+        payload = {"url": self.feed.feed_address}
+        headers = {
+            "x-rapidapi-key": settings.SCRAPENINJA_API_KEY,
+            "x-rapidapi-host": "scrapeninja.p.rapidapi.com",
+            "Content-Type": "application/json",
+        }
+        logging.debug(
+            " ***> [%-30s] ~FRForbidden feed fetch: %s -> %s" % (self.feed.log_title[:30], url, payload)
+        )
+        response = requests.post(url, json=payload, headers=headers)
+
+        if response.status_code != 200:
+            logging.debug(
+                " ***> [%-30s] ~FRForbidden feed fetch failed: %s -> %s"
+                % (self.feed.log_title[:30], url, payload)
+            )
+            return None
+        body = response.json().get("body")
+        if not body:
+            logging.debug(
+                " ***> [%-30s] ~FRForbidden feed fetch failed: %s -> %s"
+                % (self.feed.log_title[:30], url, response.json())
+            )
+            return None
+
+        if "enable JS" in body and not js_scrape:
+            return self.fetch_forbidden(js_scrape=True)
+
+        logging.debug(
+            " ***> [%-30s] ~FRForbidden feed fetch succeeded: %s -> %s"
+            % (self.feed.log_title[:30], url, body)
+        )
+        return body
+
+
 class ProcessFeed:
     def __init__(self, feed_id, fpf, options, raw_feed=None):
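The method above can be exercised outside the fetcher when debugging a single 403'd feed. A minimal standalone sketch using the same endpoints, headers, and response envelope as the diff; the scrape helper name and the environment-variable key are ours:

    import os

    import requests

    def scrape(url, js=False):
        # Plain scrape first; the JS-rendering endpoint only on request,
        # mirroring fetch_forbidden's two-step fallback.
        endpoint = "https://scrapeninja.p.rapidapi.com/scrape-js" if js else "https://scrapeninja.p.rapidapi.com/scrape"
        response = requests.post(
            endpoint,
            json={"url": url},
            headers={
                "x-rapidapi-key": os.environ["SCRAPENINJA_API_KEY"],
                "x-rapidapi-host": "scrapeninja.p.rapidapi.com",
                "Content-Type": "application/json",
            },
        )
        if response.status_code != 200:
            return None
        # ScrapeNinja returns a JSON envelope; the fetched document sits under
        # "body", which is what gets handed to feedparser in the diff above.
        return response.json().get("body")

    print(scrape("https://example.com/feed.xml"))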
@@ -558,6 +605,10 @@ class ProcessFeed:
                 " ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                 % (self.feed.log_title[:30], self.fpf.status)
             )
+            if self.fpf.status == 403 and not self.feed.is_forbidden:
+                self.feed.is_forbidden = True
+                self.feed.date_forbidden = datetime.datetime.now()
+                self.feed = self.feed.save()
             fixed_feed = None
             if not self.feed.known_good:
                 fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
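Taken together, the commit implements a two-pass fallback: the first fetch that sees a 403 flags the feed, and every subsequent fetch routes through ScrapeNinja. A condensed, illustrative flow (not actual NewsBlur code; both fetch helpers are hypothetical stand-ins for the FetchFeed paths above):

    import datetime

    def fetch_plain(feed):
        return 403, None  # stand-in: the origin blocks the fetcher

    def fetch_via_scrapeninja(feed):
        return "<rss>...</rss>"  # stand-in: ScrapeNinja succeeds

    def fetch_with_fallback(feed):
        if feed.is_forbidden:
            return fetch_via_scrapeninja(feed)
        status, body = fetch_plain(feed)
        if status == 403 and not feed.is_forbidden:
            # The first 403 flags the feed; the next pass takes the fallback.
            feed.is_forbidden = True
            feed.date_forbidden = datetime.datetime.now()
            feed.save()
        return body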