from .attr import Attr
from ..classifiers import SklearnClassifier
from ..libs import np
from ..libs import sk_naive_bayes, sk_svm
from ..prob import Prob
from ..utils.checks import is_sklearn_classifier
class Classification(Attr):
    """
    Implements the population.classification attribute.

    Calling the attribute builds a classifier trained with the data of the
    population stored in ``self._population`` (presumably set by the ``Attr``
    base class -- confirm there).
    """

    # Method names that __call__ accepts as strings in the ``which`` argument.
    _methods = ('naive_bayes',)

    def __call__(self, labels=None, which='naive_bayes', **kwargs):
        """
        Create a classifier for the population.

        Args:
            labels:
                Labels used for training. See :meth:`_normalize_labels` for
                the accepted values.
            which:
                Either an object accepted by ``is_sklearn_classifier`` or the
                name of one of the methods listed in ``_methods`` (case
                insensitive, dashes are treated as underscores).
            **kwargs:
                Forwarded to the selected method.

        Raises:
            NotImplementedError:
                If ``which`` is a callable that is not a scikit learn
                classifier.
            ValueError:
                If ``which`` is not a recognized method.
        """
        if is_sklearn_classifier(which):
            return self.sklearn(which, labels, **kwargs)
        elif callable(which):
            raise NotImplementedError('do not accept function classifiers')
        elif isinstance(which, str):
            which_ = which.lower().replace('-', '_')
            if which_ in self._methods:
                method = getattr(self, which_)
                return method(labels, **kwargs)
        raise ValueError('invalid method: %r' % which)

    def _normalize_labels(self, labels=None):
        """
        Normalize the labels argument and return a sequence of labels.

        Args:
            labels:
                None or an empty string fetch the 'labels' entry of the
                population metadata. Any other string is looked up in the
                metadata, with 'ancestry' falling back to the labels produced
                by ``ancestry_labels``. Any other value is treated as an
                iterable with one label per individual.

        Raises:
            ValueError:
                If the labels cannot be found in the metadata or if an
                explicit sequence does not match the population size.
        """
        pop = self._population

        # Only compare against '' when labels is a string: array-like labels
        # (e.g. numpy arrays) compare elementwise and cannot be used directly
        # as a condition.
        if labels is None or (isinstance(labels, str) and labels == ''):
            try:
                return pop.meta['labels']
            except KeyError:
                raise ValueError('could not fetch labels from metadata')
        elif isinstance(labels, str):
            # An explicit metadata entry takes precedence over the special
            # 'ancestry' keyword.
            if labels in pop.meta:
                return pop.meta[labels]
            if labels == 'ancestry':
                return ancestry_labels(pop)
            raise ValueError('could not find %r metadata' % labels)
        else:
            labels = list(labels)
            if len(labels) != pop.size:
                raise ValueError(
                    'list of labels must have the same size as the population'
                )
            return labels

    def naive_bayes(self, labels=None, data='count', prior='uniform',
                    alpha=0.5):
        """
        Classify objects using the naive_bayes classifier.

        Args:
            labels:
                List of labels or a string with the metadata column used as
                label. Optionally, the 'ancestry' string classify using the
                sub-populations as labels.
            data:
                Method used to convert the population to a data set. Only
                'count' is currently supported.
            alpha:
                Additive (Laplace/Lidstone) smoothing parameter (0 for no
                smoothing).
            prior:
                The prior probability for each label. Must be either a Prob()
                object, the string 'uniform' or None. The default value is
                'uniform' that assigns a fixed uniform prior. If prior is None,
                it learns priors from data. Finally, it can also be specified
                as a Prob() object or a mapping from labels to probabilities.

        Raises:
            ValueError:
                If ``data`` is not a supported conversion method.
        """
        kwargs = {'alpha': alpha}

        # Prepare prior data. The isinstance() guard keeps array-like priors
        # from being compared elementwise with the 'uniform' string.
        if isinstance(prior, str) and prior == 'uniform':
            kwargs['fit_prior'] = False
            kwargs['class_prior'] = None
        elif prior is None:
            kwargs['fit_prior'] = True
            kwargs['class_prior'] = None
        else:
            labels = self._normalize_labels(labels)
            prob = Prob(prior)

            # One entry per distinct label, in sorted order. The vector is
            # sized by the classes found in the labels rather than by the
            # number of entries of the prior, so both always agree.
            classes = sorted(set(labels))
            prob_vector = np.array(
                [prob[label] for label in classes], dtype=float
            )
            kwargs['fit_prior'] = True
            kwargs['class_prior'] = prob_vector

        if data == 'count':
            classifier = sk_naive_bayes.MultinomialNB
        else:
            raise ValueError(
                'naive bayes only accepts "count" for the data argument'
            )
        return self.sklearn(classifier, labels, data=data, **kwargs)

    def svm(self, labels=None, data='count', **kwargs):
        """
        Classify objects using the Support Vector Machine (SVM) classifier.

        Args:
            labels:
                Labels used for training. See :meth:`_normalize_labels` for
                the accepted values.
            data (str):
                The method used to convert the population to a usable data
                set.
            **kwargs:
                Forwarded to :meth:`sklearn`.
        """
        classifier = sk_svm.SVC
        return self.sklearn(classifier, labels, data=data, **kwargs)

    def sklearn(self, classifier, labels=None, data='count', **kwargs):
        """
        Uses a scikit learn classifier to classify population.

        Args:
            classifier:
                A scikit learn classifier class (e.g.,
                sklearn.naive_bayes.BernoulliNB)
            labels:
                A sequence of labels used to train the classifier.
            data (str):
                The method used to convert the population to a usable data set.
                It uses the same options as in the :meth:`Population.as_array`
                method.
            **kwargs:
                Forwarded to ``SklearnClassifier``.
        """
        # The same conversion function is applied to the training population
        # here and handed over to the classifier wrapper.
        func = lambda pop: pop.as_array(data)
        raw_data = func(self._population)
        labels = self._normalize_labels(labels)
        return SklearnClassifier(classifier, raw_data, func, labels, **kwargs)
def ancestry_labels(pop):
    """
    Build a flat list with one label per individual of ``pop``.

    Every sub-population in ``pop.populations`` contributes ``size`` copies of
    its label. The label is the sub-population ``id`` or, when the id is
    falsy (e.g. None or an empty string), the position of the sub-population
    in the sequence.
    """
    return [
        label
        for index, subpop in enumerate(pop.populations)
        for label in [subpop.id or index] * subpop.size
    ]