NewsBlur-viq/apps/analyzer/classifier.py
2024-04-24 09:50:42 -04:00

176 lines
5.6 KiB
Python

import logging
import math

from django.db.models.aggregates import Sum

from apps.analyzer.models import Category, FeatureCategory
class Classifier:
    """Count-based classifier whose counts live in the analyzer models.

    Per-category counts are stored in ``Category`` rows and per-(feature,
    category) counts in ``FeatureCategory`` rows. Every query issued by this
    class is scoped to the ``user`` and ``feed`` given at construction time.
    """

    def __init__(self, user, feed, phrases):
        """Remember the user, the feed and the phrases to look for."""
        self.user = user
        self.feed = feed
        self.phrases = phrases

    def get_features(self, doc):
        """Return the phrases from ``self.phrases`` that occur in ``doc``.

        The result maps each matching phrase to the number of times it is
        listed in ``self.phrases`` (1 unless a phrase is repeated there). It
        is *not* the number of occurrences inside ``doc``.
        """
        found = {}
        for phrase in self.phrases:
            if phrase in doc:
                found[phrase] = found.get(phrase, 0) + 1
        return found

    def increment_feature(self, feature, category):
        """Add one to the stored count of ``feature`` within ``category``."""
        # A single get_or_create replaces the former "read count, then insert
        # or fetch again" sequence. That sequence inserted a duplicate row
        # whenever an existing row had a stored count of 0, and it wrote a
        # float back into ``count``.
        fc, created = FeatureCategory.objects.get_or_create(
            user=self.user,
            feed=self.feed,
            feature=feature,
            category=category,
            defaults={"count": 1},
        )
        if not created:
            fc.count += 1
            fc.save()

    def feature_count(self, feature, category):
        """Return how often ``feature`` was counted for ``category``.

        ``category`` may be a ``Category`` instance, in which case its
        ``category`` attribute is used for the lookup. Returns ``0`` when no
        matching row exists, otherwise the stored count as a float.
        """
        if isinstance(category, Category):
            category = category.category
        try:
            row = FeatureCategory.objects.get(
                user=self.user, feed=self.feed, feature=feature, category=category
            )
        except FeatureCategory.DoesNotExist:
            return 0
        return float(row.count)

    def increment_category(self, category):
        """Add one to the stored count of ``category``."""
        # Same reasoning as increment_feature: one lookup-or-insert instead of
        # a count check followed by a second query.
        row, created = Category.objects.get_or_create(
            user=self.user,
            feed=self.feed,
            category=category,
            defaults={"count": 1},
        )
        if not created:
            row.count += 1
            row.save()

    def category_count(self, category):
        """Return the stored count of ``category`` as a float.

        ``category`` may be a ``Category`` instance (its ``count`` is used
        directly) or a value to look up. Returns ``0`` when the lookup finds
        no row.
        """
        if isinstance(category, Category):
            return float(category.count)
        try:
            row = Category.objects.get(user=self.user, feed=self.feed, category=category)
        except Category.DoesNotExist:
            return 0
        return float(row.count)

    def categories(self):
        """Return the ``Category`` rows belonging to this user and feed."""
        # Scoped like every other query in this class. An unfiltered
        # ``Category.objects.all()`` would mix in other users' and feeds'
        # categories, inflating the totals in weighted_probability().
        return Category.objects.filter(user=self.user, feed=self.feed)

    def totalcount(self):
        """Return the sum of all category counts for this user and feed."""
        totals = Category.objects.filter(user=self.user, feed=self.feed).aggregate(
            sum=Sum("count")
        )
        # Sum() over an empty queryset yields None; report that as zero.
        return totals["sum"] or 0

    def train(self, item, category):
        """Record ``item`` as an example of ``category``."""
        # Increment the count for every feature with this category
        for feature in self.get_features(item):
            self.increment_feature(feature, category)
        # Increment the count for this category
        self.increment_category(category)

    def feature_probability(self, feature, category):
        """Return feature_count / category_count, or ``0`` for an empty category."""
        # Look the category count up once instead of once for the guard and
        # once more for the division.
        in_category = self.category_count(category)
        if in_category == 0:
            return 0
        # The total number of times this feature appeared in this
        # category divided by the total number of items in this category
        return self.feature_count(feature, category) / in_category

    def weighted_probability(self, feature, category, prf, weight=1.0, ap=0.5):
        """Blend ``prf(feature, category)`` with the assumed probability ``ap``.

        The result is the weighted average of ``ap`` (weighted by ``weight``)
        and the value returned by ``prf`` (weighted by the number of times the
        feature was counted across all categories).
        """
        # Calculate current probability
        basic_prob = prf(feature, category)
        # Count the number of times this feature has appeared in all categories
        totals = sum(self.feature_count(feature, c) for c in self.categories())
        # Calculate the weighted average
        bp = ((weight * ap) + (totals * basic_prob)) / (weight + totals)
        logging.getLogger(__name__).debug(
            "weighted_probability: %s %s %s %s %s", feature, category, basic_prob, totals, bp
        )
        return bp
class FisherClassifier(Classifier):
    """Classifier that combines per-feature probabilities with Fisher's method.

    For each category the weighted feature probabilities are combined as
    ``-2 * ln(product)`` and mapped through an inverse chi-square function.
    ``classify`` picks the category with the highest resulting score that
    also exceeds that category's configured minimum.
    """

    def __init__(self, user, feed, phrases):
        """Set up the base classifier and an empty table of minimums."""
        Classifier.__init__(self, user, feed, phrases)
        # Maps a category to the score it must exceed to be chosen.
        self.minimums = {}

    def category_probability(self, feature, category):
        """Return this category's share of the feature's overall frequency.

        Returns ``0`` when the feature has no frequency in ``category`` or
        when the frequencies over all categories sum to zero.
        """
        # The frequency of this feature in this category
        clf = self.feature_probability(feature, category)
        if clf == 0:
            return 0
        # The frequency of this feature in all the categories. Each term must
        # use the loop variable ``c``; summing the frequency of ``category``
        # itself once per category would reduce the result to
        # 1 / number_of_categories.
        freqsum = sum(self.feature_probability(feature, c) for c in self.categories())
        if freqsum == 0:
            return 0
        # The probability is the frequency in this category divided by
        # the overall frequency
        return clf / freqsum

    def fisher_probability(self, item, category):
        """Return the combined score of ``item`` for ``category``.

        When ``item`` contains none of the phrases, the product defaults to
        0.5 with zero degrees of freedom, which yields a score of 0.5.
        """
        features = self.get_features(item)
        if features:
            # Sum the logs rather than taking the log of the product: with
            # many features the product can underflow to 0.0, and
            # math.log(0.0) raises ValueError.
            log_product = sum(
                math.log(self.weighted_probability(feature, category, self.category_probability))
                for feature in features
            )
        else:
            log_product = math.log(0.5)
        # Take the natural log and multiply by -2
        fscore = -2 * log_product
        # Use the inverse chi2 function to get a probability
        return self.invchi2(fscore, len(features) * 2)

    def invchi2(self, chi, df):
        """Return the inverse chi-square value for ``chi`` with ``df`` degrees of freedom.

        Computes ``exp(-m) * sum(m**i / i! for i in range(df // 2))`` with
        ``m = chi / 2``, capped at 1.0.
        """
        m = chi / 2.0
        term = math.exp(-m)
        total = term
        for i in range(1, df // 2):
            term *= m / i
            total += term
        return min(total, 1.0)

    def setminimum(self, category, min):
        """Set the score ``category`` must exceed to be returned by classify()."""
        # The parameter name shadows the builtin but is kept so existing
        # keyword callers continue to work.
        self.minimums[category] = min

    def getminimum(self, category):
        """Return the configured minimum for ``category`` (``0`` if unset)."""
        return self.minimums.get(category, 0)

    def classify(self, item, default=None):
        """Return the best-scoring category for ``item``, or ``default``.

        A category is eligible only if its score exceeds both its configured
        minimum and the best score seen so far (initially 0.0).
        """
        # Fetch the categories once and reuse them for logging and the loop.
        categories = self.categories()
        logging.getLogger(__name__).debug("classify: categories=%s item=%s", categories, item)
        # Loop through looking for the best result
        best = default
        best_score = 0.0
        for category in categories:
            p = self.fisher_probability(item, category)
            # Make sure it exceeds its minimum
            if p > self.getminimum(category) and p > best_score:
                best = category
                best_score = p
        return best