Sending 403'd sites over to scrapeninja.

This commit is contained in:
Samuel Clay 2025-03-04 22:25:04 -08:00
parent 16f1d4daf6
commit 6556d7c789
7 changed files with 98 additions and 12 deletions

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,23 @@
# Generated by Django 3.1.10 on 2025-03-05 05:52
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add forbidden-state tracking fields to the Feed model.

    Adds two nullable columns so existing rows need no backfill:
    ``is_forbidden`` (the feed's server returned HTTP 403) and
    ``date_forbidden`` (when that state was recorded).
    """

    # Must run after the previous rss_feeds migration so the feeds table
    # is in its expected prior state.
    dependencies = [
        ('rss_feeds', '0010_feed_discover_indexed'),
    ]

    operations = [
        # Timestamp of when the feed was marked forbidden; NULL until set.
        migrations.AddField(
            model_name='feed',
            name='date_forbidden',
            field=models.DateTimeField(blank=True, null=True),
        ),
        # Tri-state flag: True = forbidden, False = explicitly cleared,
        # NULL = never evaluated.
        migrations.AddField(
            model_name='feed',
            name='is_forbidden',
            field=models.BooleanField(blank=True, null=True),
        ),
    ]

View file

@ -118,6 +118,8 @@ class Feed(models.Model):
similar_feeds = models.ManyToManyField( similar_feeds = models.ManyToManyField(
"self", related_name="feeds_by_similarity", symmetrical=False, blank=True "self", related_name="feeds_by_similarity", symmetrical=False, blank=True
) )
is_forbidden = models.BooleanField(blank=True, null=True)
date_forbidden = models.DateTimeField(blank=True, null=True)
class Meta: class Meta:
db_table = "feeds" db_table = "feeds"

View file

@ -24,17 +24,7 @@ RUN set -ex \
' \ ' \
&& apt-get update || (echo "Retrying apt-get update with different DNS" && echo "nameserver 8.8.8.8" > /etc/resolv.conf && apt-get update) \ && apt-get update || (echo "Retrying apt-get update with different DNS" && echo "nameserver 8.8.8.8" > /etc/resolv.conf && apt-get update) \
&& apt-get install -y $rundDeps $buildDeps --no-install-recommends || (echo "Retrying apt-get install with different package names" && apt-get install -y libpq5 libjpeg62-turbo libxslt1.1 patch gfortran libblas-dev libffi-dev libjpeg-dev libpq-dev libev-dev libreadline-dev liblapack-dev libxml2-dev libxslt1-dev libncurses-dev zlib1g-dev --no-install-recommends) \ && apt-get install -y $rundDeps $buildDeps --no-install-recommends || (echo "Retrying apt-get install with different package names" && apt-get install -y libpq5 libjpeg62-turbo libxslt1.1 patch gfortran libblas-dev libffi-dev libjpeg-dev libpq-dev libev-dev libreadline-dev liblapack-dev libxml2-dev libxslt1-dev libncurses-dev zlib1g-dev --no-install-recommends) \
&& apt-get install -y wget curl ca-certificates \ && apt-get install -y wget curl ca-certificates
&& ARCH=$(uname -m) \
&& if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
CURL_IMPERSONATE_URL="https://github.com/lexiforest/curl-impersonate/releases/download/v0.9.3/curl-impersonate-v0.9.3.aarch64-linux-gnu.tar.gz"; \
else \
CURL_IMPERSONATE_URL="https://github.com/lexiforest/curl-impersonate/releases/download/v0.9.3/curl-impersonate-v0.9.3.x86_64-linux-gnu.tar.gz"; \
fi \
&& wget $CURL_IMPERSONATE_URL \
&& tar -xzf curl-impersonate-*.tar.gz -C /usr/local/bin/ \
&& rm curl-impersonate-*.tar.gz \
&& chmod +x /usr/local/bin/curl-impersonate-chrome
COPY config/requirements.txt /srv/newsblur/ COPY config/requirements.txt /srv/newsblur/
# Install Rust (required for tiktoken) # Install Rust (required for tiktoken)

View file

@ -11998,7 +11998,7 @@ form.opml_import_form input {
} }
.NB-modal-exception .NB-modal-submit-button { .NB-modal-exception .NB-modal-submit-button {
float: left; display: inline-block;
} }
.NB-modal-exception .NB-exception-submit-wrapper { .NB-modal-exception .NB-exception-submit-wrapper {

View file

@ -547,6 +547,7 @@ FACEBOOK_NAMESPACE = "newsblur"
TWITTER_CONSUMER_KEY = "ooooooooooooooooooooo" TWITTER_CONSUMER_KEY = "ooooooooooooooooooooo"
TWITTER_CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" TWITTER_CONSUMER_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
YOUTUBE_API_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" YOUTUBE_API_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
SCRAPENINJA_API_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# =============== # ===============
# = AWS Backing = # = AWS Backing =

View file

@ -169,6 +169,14 @@ class FetchFeed:
) )
return FEED_ERRHTTP, None return FEED_ERRHTTP, None
self.fpf = feedparser.parse(facebook_feed) self.fpf = feedparser.parse(facebook_feed)
elif self.feed.is_forbidden:
forbidden_feed = self.fetch_forbidden()
if not forbidden_feed:
logging.debug(
" ***> [%-30s] ~FRForbidden feed fetch failed: %s" % (self.feed.log_title[:30], address)
)
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(forbidden_feed)
if not self.fpf and "json" in address: if not self.fpf and "json" in address:
try: try:
@ -334,6 +342,45 @@ class FetchFeed:
youtube_fetcher = YoutubeFetcher(self.feed, self.options) youtube_fetcher = YoutubeFetcher(self.feed, self.options)
return youtube_fetcher.fetch() return youtube_fetcher.fetch()
def fetch_forbidden(self, js_scrape=False):
    """Fetch a 403-forbidden feed through the ScrapeNinja proxy API.

    Sends the feed address to ScrapeNinja (via RapidAPI) and returns the
    scraped response body as text, or None on any failure. If the plain
    scrape endpoint reports that JavaScript is required ("enable JS" in
    the body), retries exactly once against the JS-rendering endpoint.

    Args:
        js_scrape: Use the JS-rendering endpoint instead of the plain one.
            Internal recursion flag; callers normally omit it.

    Returns:
        str or None: The scraped feed body, or None if the request failed,
        returned a non-200 status, or had no "body" field.
    """
    url = "https://scrapeninja.p.rapidapi.com/scrape"
    if js_scrape:
        url = "https://scrapeninja.p.rapidapi.com/scrape-js"
    payload = {"url": self.feed.feed_address}
    headers = {
        "x-rapidapi-key": settings.SCRAPENINJA_API_KEY,
        "x-rapidapi-host": "scrapeninja.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    logging.debug(
        " ***> [%-30s] ~FRForbidden feed fetch: %s -> %s" % (self.feed.log_title[:30], url, payload)
    )
    try:
        # Timeout prevents a stalled proxy from hanging the fetch worker
        # indefinitely; JS rendering can be slow, so allow a generous window.
        response = requests.post(url, json=payload, headers=headers, timeout=60)
    except requests.RequestException as e:
        # Connection errors/timeouts follow the same None-on-failure
        # contract as HTTP-level failures instead of propagating.
        logging.debug(
            " ***> [%-30s] ~FRForbidden feed fetch failed: %s -> %s"
            % (self.feed.log_title[:30], url, e)
        )
        return None
    if response.status_code != 200:
        logging.debug(
            " ***> [%-30s] ~FRForbidden feed fetch failed: %s -> %s"
            % (self.feed.log_title[:30], url, payload)
        )
        return None
    try:
        # Parse once; a 200 with a non-JSON payload must not raise.
        response_json = response.json()
    except ValueError:
        response_json = {}
    body = response_json.get("body")
    if not body:
        logging.debug(
            " ***> [%-30s] ~FRForbidden feed fetch failed: %s -> %s"
            % (self.feed.log_title[:30], url, response_json)
        )
        return None
    if "enable JS" in body and not js_scrape:
        # Plain scrape was blocked by a JS challenge; retry once with the
        # JS-rendering endpoint (js_scrape=True guards against recursion).
        return self.fetch_forbidden(js_scrape=True)
    logging.debug(
        " ***> [%-30s] ~FRForbidden feed fetch succeeded: %s -> %s"
        % (self.feed.log_title[:30], url, body)
    )
    return body
class ProcessFeed: class ProcessFeed:
def __init__(self, feed_id, fpf, options, raw_feed=None): def __init__(self, feed_id, fpf, options, raw_feed=None):
@ -558,6 +605,10 @@ class ProcessFeed:
" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." " ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
% (self.feed.log_title[:30], self.fpf.status) % (self.feed.log_title[:30], self.fpf.status)
) )
if self.fpf.status == 403 and not self.feed.is_forbidden:
self.feed.is_forbidden = True
self.feed.date_forbidden = datetime.datetime.now()
self.feed = self.feed.save()
fixed_feed = None fixed_feed = None
if not self.feed.known_good: if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address() fixed_feed, feed = self.feed.check_feed_link_for_feed_address()