mirror of
https://github.com/viq/NewsBlur.git
synced 2025-04-13 09:38:09 +00:00
57 lines
1.8 KiB
Python
Executable file
57 lines
1.8 KiB
Python
Executable file
#!/usr/bin/env python
|
|
|
|
"""
|
|
The simplest TF-IDF library imaginable.
|
|
Add your documents as two-element lists `[docname, [list_of_words_in_the_document]]` with `addDocument(docname, list_of_words)`. Get a list of all the `[docname, similarity_score]` pairs relative to a document by calling `similarities([list_of_words])`.
|
|
See the README for a usage example.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
|
|
class tfidf:
    """Minimal in-memory corpus that scores documents against a word list.

    Attributes:
        weighted: Flag initialised to False; nothing in this class reads it.
        documents: List of ``[doc_name, {word: relative_frequency}]`` entries,
            in the order the documents were added.
        corpus_dict: Maps each word to the total number of times it has
            occurred across every document added so far (stored as a float).
    """

    def __init__(self):
        self.weighted = False
        self.documents = []
        self.corpus_dict = {}

    def _term_frequencies(self, list_of_words):
        """Return ``{word: occurrences / len(list_of_words)}`` for the words."""
        counts = {}
        for word in list_of_words:
            counts[word] = counts.get(word, 0.0) + 1.0

        # An empty word list yields an empty dict, so the division below
        # never runs with a zero length.
        length = float(len(list_of_words))
        for word in counts:
            counts[word] = counts[word] / length
        return counts

    def addDocument(self, doc_name, list_of_words):
        """Add a document to the corpus.

        Args:
            doc_name: Identifier stored alongside the document and returned
                by ``similarities``.
            list_of_words: The words of the document, with repetitions.
        """
        # Corpus-wide totals count every occurrence, not one per document.
        for word in list_of_words:
            self.corpus_dict[word] = self.corpus_dict.get(word, 0.0) + 1.0

        # Store the document as length-normalised term frequencies.
        self.documents.append([doc_name, self._term_frequencies(list_of_words)])

    def similarities(self, list_of_words):
        """Returns a list of all the [docname, similarity_score] pairs relative to a list of words.

        For every query word that also appears in a document, the score grows
        by ``query_frequency / corpus_count + document_frequency / corpus_count``.
        Documents sharing no word with the query score 0.0.

        Args:
            list_of_words: The query words, with repetitions.

        Returns:
            One ``[doc_name, score]`` list per stored document, in insertion
            order.
        """
        query_dict = self._term_frequencies(list_of_words)

        sims = []
        for doc_name, doc_dict in self.documents:
            score = 0.0
            for word in query_dict:
                # `in` replaces dict.has_key(), which no longer exists on
                # Python 3. Any word found in a stored document is guaranteed
                # to be in corpus_dict because addDocument registers it there.
                if word in doc_dict:
                    corpus_count = self.corpus_dict[word]
                    score += (query_dict[word] / corpus_count) + (doc_dict[word] / corpus_count)
            sims.append([doc_name, score])

        return sims
|