Source code for kpop.population.classification

from .attr import Attr
from ..classifiers import SklearnClassifier
from ..libs import np
from ..libs import sk_naive_bayes, sk_svm
from ..prob import Prob
from ..utils.checks import is_sklearn_classifier


[docs]class Classification(Attr):
    """
    Implements the population.classification attribute.
    """

    _methods = ('naive_bayes',)

    def __call__(self, labels=None, which='naive_bayes', **kwargs):
        if is_sklearn_classifier(which):
            return self.sklearn(which, labels, **kwargs)
        elif callable(which):
            raise NotImplementedError('do not accept function classifiers')
        elif isinstance(which, str):
            which_ = which.lower().replace('-', '_')
            if which_ in self._methods:
                method = getattr(self, which_)
                return method(labels, **kwargs)
        raise ValueError('invalid method: %r' % which)

    def _normalize_labels(self, labels=None):
        "Normalizes the labels attribute and return a sequence of labels."

        pop = self._population
        if labels is None or labels == '':
            try:
                return pop.meta['labels']
            except KeyError:
                raise ValueError('could not fetch labels from metadata')
        elif isinstance(labels, str):
            if labels in pop.meta:
                return pop.meta[labels]
            if labels == 'ancestry':
                return ancestry_labels(pop)
            raise ValueError('could not find %r metadata' % labels)
        else:
            labels = list(labels)
            if len(labels) != pop.size:
                raise ValueError(
                    'list of labels must have the same size as the population'
                )
            return labels

[docs]    def naive_bayes(self, labels=None, data='count', prior='uniform',
                    alpha=0.5):
        """
        Classify objects using the naive_bayes classifier.


        Args:
            labels:
                List of labels or a string with the metadata column used as
                label. Optionally, the 'ancestry' string classify using the
                sub-populations as labels.
            alpha:
                Additive (Laplace/Lidstone) smoothing parameter (0 for no
                smoothing).
            prior:
                The prior probability for each label. Must be either a Prob()
                object, the string 'uniform' or None. The default value is
                'uniform' that assigns a fixed uniform prior. If prior is None,
                it learns priors from data. Finally, it can also be specified
                as a Prob() object or a mapping from labels to probabilities.
        """

        kwargs = {'alpha': alpha}

        # Prepare prior data
        if prior == 'uniform':
            kwargs['fit_prior'] = False
            kwargs['class_prior'] = None
        elif prior is None:
            kwargs['fit_prior'] = True
            kwargs['class_prior'] = None
        else:
            labels = self._normalize_labels(labels)
            prob = Prob(prior)
            prob_vector = np.zeros(len(prob))
            for i, label in enumerate(sorted(set(labels))):
                prob_vector[i] = prob[label]
            kwargs['fit_prior'] = True
            kwargs['class_prior'] = prob_vector

        if data == 'count':
            classifier = sk_naive_bayes.MultinomialNB
        else:
            raise ValueError(
                'naive bayes only accepts "count" and "flat" for the data '
                'argument'
            )
        return self.sklearn(classifier, labels, data=data, **kwargs)

[docs]    def svm(self, labels=None, data='count', **kwargs):
        """
        Classify objects using the Support Vector Machine (SVM) classifier.
        """

        classifier = sk_svm.SVC
        return self.sklearn(classifier, labels, data=data, **kwargs)

[docs]    def sklearn(self, classifier, labels=None, data='count', **kwargs):
        """
        Uses a scikit learn classifier to classify population.

        Args:
            classifier:
                A scikit learn classifier class (e.g.,
                sklearn.naive_bayes.BernoulliNB)
            labels:
                A sequence of labels used to train the classifier.
            data (str):
                The method used to convert the population to a usable data set.
                It uses the same options as in the :meth:`Population.as_array`
                method.
        """
        func = lambda pop: pop.as_array(data)
        raw_data = func(self._population)
        labels = self._normalize_labels(labels)
        return SklearnClassifier(classifier, raw_data, func, labels, **kwargs)


def ancestry_labels(pop):
    """
    Return a list of labels from a population object using subpopulation ids as
    data.
    """

    labels = []
    for i, subpop in enumerate(pop.populations):
        label = subpop.id or i
        labels.extend([label] * subpop.size)
    return labels