Source code for kpop.prob

import collections
from collections import defaultdict
from collections.abc import Mapping
from math import log
from random import random

from .libs import np


class Prob(Mapping):
    """
    A dictionary-like object that behaves as a mapping between categories to
    their respective probabilities.

    The probabilities are stored in a plain dict (``_data``). The object
    exposes the read-only ``Mapping`` interface; the only mutating methods
    are :meth:`update_support` and :meth:`set_support`, which add categories
    with zero probability.
    """

    __slots__ = ('_data',)

    @classmethod
    def mixture(cls, coeffs, probs):
        """
        Create a mixture probability from the given coeffs and list of Probs
        objects.

        Args:
            coeffs:
                Mixture coefficients. These coefficients do not have to be
                normalized (the resulting distribution is normalized by the
                constructor).
            probs:
                List of Prob objects (any mapping from category to
                probability works).

        Returns:
            A Prob object representing the mixture.

        Raises:
            ValueError: if ``coeffs`` and ``probs`` have different lengths.
        """
        if len(coeffs) != len(probs):
            raise ValueError('coeffs and probs must be aligned')

        data = defaultdict(float)
        for q, prob in zip(coeffs, probs):
            for k, p in prob.items():
                data[k] += q * p
        # Build through ``cls`` so that subclasses get instances of their
        # own type from this alternate constructor.
        return cls(data)

    def __init__(self, data, normalize=True, support=None):
        """
        Args:
            data:
                Either something ``dict()`` accepts (a mapping or an iterable
                of pairs) or a plain iterable of probabilities, in which case
                the categories are the positions 0, 1, 2, ...
            normalize:
                If true (default), divide all values by their sum so the
                distribution sums to one.
            support:
                Optional iterable of categories that must be present in the
                distribution (missing ones are added with probability 0.0).

        Raises:
            ZeroDivisionError: if ``normalize`` is true and the (non-empty)
                values are plain Python numbers that sum to zero.
        """
        try:
            self._data = dict(data)
        except TypeError:
            # Not a mapping / sequence of pairs: treat as a flat sequence of
            # probabilities indexed by position.
            self._data = dict(enumerate(data))

        if normalize:
            norm = sum(self._data.values())
            if norm != 1:
                self._data = {k: v / norm for k, v in self._data.items()}

        # ``is not None`` rather than truthiness so that array-like supports
        # (whose truth value may be ambiguous) are accepted.
        if support is not None:
            self.update_support(support)

    def __getitem__(self, key):
        return self._data[key]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._data)

    def __eq__(self, other):
        # Two distributions are equal when they agree on every category with
        # non-zero probability; explicit zero entries are ignored.
        if isinstance(other, Mapping):
            nonzero_a = {k: v for k, v in self.items() if v}
            nonzero_b = {k: v for k, v in other.items() if v}
            return nonzero_a == nonzero_b
        return NotImplemented

    def keys(self):
        return self._data.keys()

    def values(self):
        return self._data.values()

    def update_support(self, support):
        """
        Force all elements in support to be explicitly present in
        distribution (possibly with null probability).

        Args:
            support:
                an iterable of elements in the support set for the
                probability distribution.
        """
        setdefault = self._data.setdefault
        for elem in support:
            setdefault(elem, 0.0)

    def set_support(self, support):
        """
        Defines the support set of distribution.

        If elements exist in support, they are forced to exist in
        distribution, possibly with zero probability.

        Raises:
            ValueError: if an element exists in the distribution but is not
                present in support.
        """
        support = set(support)
        for x in self:
            if x not in support:
                raise ValueError('%r not in support' % x)
        self.update_support(support)

    def entropy(self):
        """
        Return the Shannon entropy for the probability distribution.

        The natural logarithm is used; zero-probability entries contribute
        nothing to the sum.
        """
        return sum(-x * log(x) for x in self.values() if x)

    def random(self):
        """
        Returns a random element.

        Elements are drawn by inverse-CDF sampling over the stored
        probabilities; elements with zero probability are never returned.

        Raises:
            ValueError: if the probabilities sum to noticeably less than one
                and the drawn number falls outside the covered range.
        """
        r = random()  # module-level random.random(): uniform in [0.0, 1.0)
        cum_prob = 0.0
        last = None
        found = False
        for elem, p in self.items():
            if not p:
                continue
            cum_prob += p
            last, found = elem, True
            # Strict comparison: since r < 1, this selects ``elem`` with
            # probability exactly p and never picks on a zero-width interval.
            if cum_prob > r:
                return elem

        # Floating point rounding may leave the cumulative sum marginally
        # below 1.0; in that case the draw belongs to the last element.
        if found and cum_prob > 1.0 - 1e-9:
            return last
        raise ValueError('probability distribution do not sum to 1.0')

    def random_sequence(self, size):
        """
        Returns a list with ``size`` random elements.
        """
        r = self.random
        return [r() for _ in range(size)]

    def max(self):
        """
        Return the value of maximum probability.

        Raises:
            ValueError: if the distribution is empty.
        """
        return max(self.values())

    def mode(self):
        """
        Return the element with the maximum probability. If more than one
        element shares the maximum probability, return an arbitrary value
        within this set.

        Returns None for an empty distribution.
        """
        p_mode = 0.0
        mode = None
        for elem, p in self.items():
            if p >= p_mode:
                p_mode = p
                mode = elem
        return mode

    def mode_set(self):
        """
        Return a set of elements that share the maximum probability.
        """
        p_mode = 0.0
        mode_set = set()
        for elem, p in self.items():
            if p > p_mode:
                mode_set = {elem}
                p_mode = p
            elif p == p_mode:
                mode_set.add(elem)
        return mode_set

    def sharp(self, mode_set=True):
        """
        Return a sharp version of the probability distribution.

        All elements receive probability zero, except the mode which receives
        probability one.

        Args:
            mode_set:
                If true (default), the unit probability is split evenly
                between all elements that share the maximum probability.
                Otherwise it is given to the single element returned by
                :meth:`mode`.

        Returns:
            A new Prob object. An empty distribution yields an empty
            distribution.
        """
        data = {k: 0.0 for k in self}
        if not data:
            # Nothing to sharpen; avoids a division by zero / a bogus
            # ``None`` category below.
            return Prob(data)

        if mode_set:
            modes = self.mode_set()
            p_mode = 1. / len(modes)
            for k in modes:
                data[k] = p_mode
        else:
            data[self.mode()] = 1.0
        return Prob(data)

    def kl_divergence(self, q: Mapping):
        r"""
        Return the Kullback-Leibler divergence with probability distribution.

        This is given by the formula:

            $KL = \sum_i p_i \ln \frac {p_i} {q_i},$

        in which p_i comes from the probability object and q_i comes from the
        argument.

        Terms with p_i = 0 contribute nothing. If some category has p_i > 0
        but is missing from ``q`` or has q_i = 0, the divergence is infinite.

        Args:
            q: a mapping from categories to probabilities.

        Returns:
            The divergence as a float (possibly ``float('inf')``).
        """
        q_get = q.get
        divergence = 0.0
        for k, p in self._data.items():
            if not p:
                continue
            q_k = q_get(k, 0.0)
            if not q_k:
                return float('inf')
            divergence += p * log(p / q_k)
        return divergence

    def encode(self, coding=None):
        """
        Encode probability distribution as a vector.

        Args:
            coding:
                a sequence of ordered categories. If omitted and every
                category is exactly of type int, the categories
                0, 1, ..., max(category) are used; otherwise the sorted
                categories of the distribution are used.

        Returns:
            A float array with the probability of each category in
            ``coding`` (0.0 for categories absent from the distribution).

        Example:
            >>> prob = Prob({'a': 0.75, 'b': 0.25})
            >>> prob.encode(['b', 'a'])
            array([0.25, 0.75])
        """
        if coding is None:
            types = {type(x) for x in self._data}
            if types == {int}:
                # builtin max over the keys (not the ``max`` method)
                coding = range(max(self) + 1)
            else:
                coding = sorted(self._data)

        prob = self._data.get
        return np.array([prob(x, 0.0) for x in coding], dtype=float)