import math from django.db.models.aggregates import Sum from apps.analyzer.models import Category, FeatureCategory class Classifier: def __init__(self, user, feed, phrases): self.user = user self.feed = feed self.phrases = phrases def get_features(self, doc): found = {} for phrase in self.phrases: if phrase in doc: if phrase in found: found[phrase] += 1 else: found[phrase] = 1 return found def increment_feature(self, feature, category): count = self.feature_count(feature, category) if count == 0: fc = FeatureCategory(user=self.user, feed=self.feed, feature=feature, category=category, count=1) fc.save() else: fc = FeatureCategory.objects.get( user=self.user, feed=self.feed, feature=feature, category=category ) fc.count = count + 1 fc.save() def feature_count(self, feature, category): if isinstance(category, Category): category = category.category try: feature_count = FeatureCategory.objects.get( user=self.user, feed=self.feed, feature=feature, category=category ) except FeatureCategory.DoesNotExist: return 0 else: return float(feature_count.count) def increment_category(self, category): count = self.category_count(category) if count == 0: category = Category(user=self.user, feed=self.feed, category=category, count=1) category.save() else: category = Category.objects.get(user=self.user, feed=self.feed, category=category) category.count = count + 1 category.save() def category_count(self, category): if not isinstance(category, Category): try: category_count = Category.objects.get(user=self.user, feed=self.feed, category=category) except Category.DoesNotExist: return 0 else: category_count = category return float(category_count.count) def categories(self): categories = Category.objects.all() return categories def totalcount(self): categories = Category.objects.filter(user=self.user, feed=self.feed).aggregate(sum=Sum("count")) return categories["sum"] def train(self, item, category): features = self.get_features(item) # Increment the count for every feature with this category for feature in features: self.increment_feature(feature, category) # Increment the count for this category self.increment_category(category) def feature_probability(self, feature, category): if self.category_count(category) == 0: return 0 # The total number of times this feature appeared in this # category divided by the total number of items in this category return self.feature_count(feature, category) / self.category_count(category) def weighted_probability(self, feature, category, prf, weight=1.0, ap=0.5): # Calculate current probability basic_prob = prf(feature, category) # Count the number of times this feature has appeared in all categories totals = sum([self.feature_count(feature, c) for c in self.categories()]) # Calculate the weighted average bp = ((weight * ap) + (totals * basic_prob)) / (weight + totals) print(feature, category, basic_prob, totals, bp) return bp class FisherClassifier(Classifier): def __init__(self, user, feed, phrases): Classifier.__init__(self, user, feed, phrases) self.minimums = {} def category_probability(self, feature, category): # The frequency of this feature in this category clf = self.feature_probability(feature, category) if clf == 0: return 0 # The frequency of this feature in all the categories freqsum = sum([self.feature_probability(feature, category) for c in self.categories()]) # The probability is the frequency in this category divided by # the overall frequency p = clf / freqsum return p def fisher_probability(self, item, category): # Multiply all the probabilities together p = 0.5 features = self.get_features(item) if features: p = 1 for feature in features: p *= self.weighted_probability(feature, category, self.category_probability) # Take the natural log and multiply by -2 fscore = -2 * math.log(p) # Use the inverse chi2 function to get a probability return self.invchi2(fscore, len(features) * 2) def invchi2(self, chi, df): m = chi / 2.0 sum = term = math.exp(-m) for i in range(1, df // 2): term *= m / i sum += term return min(sum, 1.0) def setminimum(self, category, min): self.minimums[category] = min def getminimum(self, category): if category not in self.minimums: return 0 return self.minimums[category] def classify(self, item, default=None): # Loop through looking for the best result best = default max = 0.0 print(self.categories(), item) for category in self.categories(): p = self.fisher_probability(item, category) # Make sure it exceeds its minimum if p > self.getminimum(category) and p > max: best = category max = p return best