Adding debug_index

This commit is contained in:
Samuel Clay 2024-11-24 13:20:37 -08:00
parent 74f0f25711
commit a45170f0dc
2 changed files with 71 additions and 2 deletions

View file

@ -533,27 +533,32 @@ class Feed(models.Model):
.filter(**criteria("feed_address", address))
.order_by("-num_subscribers")
)
logging.debug(f" ---> Feeds found by address: {feed}")
if not feed:
duplicate_feed = DuplicateFeed.objects.filter(**criteria("duplicate_address", address))
if duplicate_feed and len(duplicate_feed) > offset:
feed = [duplicate_feed[offset].feed]
logging.debug(f" ---> Feeds found by duplicate address: {duplicate_feed} {feed} (offset: {offset})")
if not feed and aggressive:
feed = (
cls.objects.filter(branch_from_feed=None)
.filter(**criteria("feed_link", address))
.order_by("-num_subscribers")
)
logging.debug(f" ---> Feeds found by link: {feed}")
return feed
@timelimit(10)
def _feedfinder_forman(url):
    # Probe `url` for feed URLs using the forman feedfinder backend.
    # Wrapped in @timelimit so a slow remote site can't hang feed discovery.
    urls = feedfinder_forman.find_feeds(url)
    logging.debug(f" ---> Feeds found by forman: {urls}")
    return urls
@timelimit(10)
def _feedfinder_pilgrim(url):
    # Probe `url` for feed URLs using the pilgrim feedfinder backend.
    # Wrapped in @timelimit so a slow remote site can't hang feed discovery.
    urls = feedfinder_pilgrim.feeds(url)
    logging.debug(f" ---> Feeds found by pilgrim: {urls}")
    return urls
# Normalize and check for feed_address, dupes, and feed_link
@ -3125,7 +3130,6 @@ class MStory(mongo.Document):
story_hash=self.story_hash,
story_feed_id=self.story_feed_id,
story_date=self.story_date,
story_content_vector=DiscoverStory.generate_story_content_vector(self.story_hash),
)
def remove_from_search_index(self):

View file

@ -576,7 +576,21 @@ class DiscoverStory:
"content_vector": story_content_vector,
}
try:
record = cls.ES().get(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type())
# Check if the content vector has changed
if record and record["_source"]["content_vector"] != story_content_vector:
cls.ES().update(
index=cls.index_name(),
id=story_hash,
body={"doc": doc}, # Wrap the document in a "doc" field for updates
doc_type=cls.doc_type(),
)
logging.debug(f" ---> ~FBStory already indexed, new content vector: {story_hash}")
else:
logging.debug(f" ---> ~FBStory already indexed, no change: {story_hash}")
except elasticsearch.exceptions.NotFoundError:
cls.ES().create(index=cls.index_name(), id=story_hash, body=doc, doc_type=cls.doc_type())
logging.debug(f" ---> ~FCIndexing discover story: {story_hash}")
except (elasticsearch.exceptions.ConnectionError, urllib3.exceptions.NewConnectionError) as e:
logging.debug(f" ***> ~FRNo search server available for discover story indexing: {e}")
except elasticsearch.exceptions.ConflictError as e:
@ -735,6 +749,57 @@ class DiscoverStory:
return story_embedding
@classmethod
def debug_index(cls, show_data=True):
    """Debug helper: log the index's mapping, settings, and doc stats.

    Args:
        show_data: If True (the default), also log up to 3 sample
            documents (most recent by "date") so field contents can
            be spot-checked. Pass False to avoid large log output.
    """
    try:
        # Bail out early if the index was never created.
        if not cls.ES().indices.exists(index=cls.index_name()):
            logging.info(f"~FR Index {cls.index_name()} does not exist")
            return

        # Mapping: which fields the index knows about.
        mapping = cls.ES().indices.get_mapping(index=cls.index_name())
        logging.info(f"~FB Index mapping for {cls.index_name()}:")
        logging.info(
            f"Properties: {list(mapping[cls.index_name()]['mappings'].get('properties', {}).keys())}"
        )
        logging.info(f"Full mapping: {mapping}")

        # Settings: shards, replicas, analyzers, etc.
        settings = cls.ES().indices.get_settings(index=cls.index_name())
        logging.info(f"~FB Index settings:")
        logging.info(settings)

        # Stats: document count confirms data actually made it in.
        stats = cls.ES().indices.stats(index=cls.index_name())
        total_docs = stats["indices"][cls.index_name()]["total"]["docs"]["count"]
        logging.info(f"~FG Total documents in index: {total_docs}")

        if show_data:
            # Sample the newest documents for a quick content check.
            body = {
                "query": {"match_all": {}},
                "size": 3,  # Limit to 3 documents for sample
                "sort": [{"date": {"order": "desc"}}],
            }
            results = cls.ES().search(body=body, index=cls.index_name())
            logging.info("~FB Sample documents:")
            for hit in results["hits"]["hits"]:
                logging.info(f"Document ID: {hit['_id']}")
                logging.info(f"Fields: {list(hit.get('_source', {}).keys())}")
                logging.info("---")
    except elasticsearch.exceptions.NotFoundError as e:
        logging.info(f"~FR Error accessing index: {e}")
    except Exception as e:
        # Broad catch is deliberate: this is a debug-only helper and must
        # never crash the caller; log and swallow anything unexpected.
        logging.info(f"~FR Unexpected error: {e}")
class SearchFeed:
_es_client = None