mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Adding debug_index
This commit is contained in:
parent
74f0f25711
commit
a45170f0dc
2 changed files with 71 additions and 2 deletions
|
@ -533,27 +533,32 @@ class Feed(models.Model):
|
|||
.filter(**criteria("feed_address", address))
|
||||
.order_by("-num_subscribers")
|
||||
)
|
||||
logging.debug(f" ---> Feeds found by address: {feed}")
|
||||
if not feed:
|
||||
duplicate_feed = DuplicateFeed.objects.filter(**criteria("duplicate_address", address))
|
||||
if duplicate_feed and len(duplicate_feed) > offset:
|
||||
feed = [duplicate_feed[offset].feed]
|
||||
logging.debug(f" ---> Feeds found by duplicate address: {duplicate_feed} {feed} (offset: {offset})")
|
||||
if not feed and aggressive:
|
||||
feed = (
|
||||
cls.objects.filter(branch_from_feed=None)
|
||||
.filter(**criteria("feed_link", address))
|
||||
.order_by("-num_subscribers")
|
||||
)
|
||||
|
||||
logging.debug(f" ---> Feeds found by link: {feed}")
|
||||
|
||||
return feed
|
||||
|
||||
@timelimit(10)
def _feedfinder_forman(url):
    # Discover candidate feed URLs on the page at `url` using the
    # "forman" feedfinder implementation.  The @timelimit decorator
    # aborts the lookup if it runs longer than 10 seconds
    # (presumably raising a timeout — confirm timelimit semantics).
    found_feed_urls = feedfinder_forman.find_feeds(url)
    logging.debug(f" ---> Feeds found by forman: {found_feed_urls}")
    return found_feed_urls
|
||||
|
||||
@timelimit(10)
def _feedfinder_pilgrim(url):
    # Discover candidate feed URLs on the page at `url` using the
    # "pilgrim" feedfinder implementation (the fallback/alternate
    # finder alongside _feedfinder_forman).  The @timelimit decorator
    # aborts the lookup if it runs longer than 10 seconds.
    found_feed_urls = feedfinder_pilgrim.feeds(url)
    logging.debug(f" ---> Feeds found by pilgrim: {found_feed_urls}")
    return found_feed_urls
|
||||
|
||||
# Normalize and check for feed_address, dupes, and feed_link
|
||||
|
@ -3125,7 +3130,6 @@ class MStory(mongo.Document):
|
|||
story_hash=self.story_hash,
|
||||
story_feed_id=self.story_feed_id,
|
||||
story_date=self.story_date,
|
||||
story_content_vector=DiscoverStory.generate_story_content_vector(self.story_hash),
|
||||
)
|
||||
|
||||
def remove_from_search_index(self):
|
||||
|
|
|
@ -576,7 +576,21 @@ class DiscoverStory:
|
|||
"content_vector": story_content_vector,
|
||||
}
|
||||
try:
|
||||
record = cls.ES().get(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type())
|
||||
# Check if the content vector has changed
|
||||
if record and record["_source"]["content_vector"] != story_content_vector:
|
||||
cls.ES().update(
|
||||
index=cls.index_name(),
|
||||
id=story_hash,
|
||||
body={"doc": doc}, # Wrap the document in a "doc" field for updates
|
||||
doc_type=cls.doc_type(),
|
||||
)
|
||||
logging.debug(f" ---> ~FBStory already indexed, new content vector: {story_hash}")
|
||||
else:
|
||||
logging.debug(f" ---> ~FBStory already indexed, no change: {story_hash}")
|
||||
except elasticsearch.exceptions.NotFoundError:
|
||||
cls.ES().create(index=cls.index_name(), id=story_hash, body=doc, doc_type=cls.doc_type())
|
||||
logging.debug(f" ---> ~FCIndexing discover story: {story_hash}")
|
||||
except (elasticsearch.exceptions.ConnectionError, urllib3.exceptions.NewConnectionError) as e:
|
||||
logging.debug(f" ***> ~FRNo search server available for discover story indexing: {e}")
|
||||
except elasticsearch.exceptions.ConflictError as e:
|
||||
|
@ -735,6 +749,57 @@ class DiscoverStory:
|
|||
|
||||
return story_embedding
|
||||
|
||||
@classmethod
def debug_index(cls, show_data=True):
    """Debug helper that logs the index's mapping, settings, stats, and
    optionally a small sample of documents.

    Args:
        show_data: If True (the default), also log up to 3 sample
            documents from the index; pass False to avoid the extra
            search and keep output small.

    Returns:
        None. All output goes to the logging module.
    """
    # NOTE(review): the ~FR/~FB/~FG prefixes in log messages look like
    # the project's log color codes — confirm against the logging setup.
    try:
        # Check if index exists; bail out early rather than raising.
        if not cls.ES().indices.exists(cls.index_name()):
            logging.info(f"~FR Index {cls.index_name()} does not exist")
            return

        # Get index mapping: log the top-level field names first, then
        # the full (verbose) mapping for deeper inspection.
        mapping = cls.ES().indices.get_mapping(index=cls.index_name())
        logging.info(f"~FB Index mapping for {cls.index_name()}:")
        logging.info(
            f"Properties: {list(mapping[cls.index_name()]['mappings'].get('properties', {}).keys())}"
        )
        logging.info(f"Full mapping: {mapping}")

        # Get index settings
        settings = cls.ES().indices.get_settings(index=cls.index_name())
        logging.info(f"~FB Index settings:")
        logging.info(settings)

        # Get index stats and report the total document count.
        stats = cls.ES().indices.stats(index=cls.index_name())
        total_docs = stats["indices"][cls.index_name()]["total"]["docs"]["count"]
        logging.info(f"~FG Total documents in index: {total_docs}")

        if show_data:
            # Sample some documents, newest first.
            # NOTE(review): assumes the index has a sortable "date"
            # field — confirm against the index mapping.
            body = {
                "query": {"match_all": {}},
                "size": 3,  # Limit to 3 documents for sample
                "sort": [{"date": {"order": "desc"}}],
            }
            results = cls.ES().search(body=body, index=cls.index_name())

            # Log each hit's id and its field names (not full values,
            # to keep output readable).
            logging.info("~FB Sample documents:")
            for hit in results["hits"]["hits"]:
                logging.info(f"Document ID: {hit['_id']}")
                logging.info(f"Fields: {list(hit.get('_source', {}).keys())}")
                logging.info("---")

    except elasticsearch.exceptions.NotFoundError as e:
        # Index (or a sub-resource) vanished between calls.
        logging.info(f"~FR Error accessing index: {e}")
    except Exception as e:
        # Best-effort debug tool: never let inspection crash the caller.
        logging.info(f"~FR Unexpected error: {e}")
|
||||
|
||||
|
||||
class SearchFeed:
|
||||
_es_client = None
|
||||
|
|
Loading…
Add table
Reference in a new issue