Source code for pyanno.modelA

# Copyright (c) 2011, Enthought, Ltd.
# Authors: Pietro Berkes <pberkes@enthought.com>, Andrey Rzhetsky
# License: Modified BSD license (2-clause)

"""This module defines the class ModelA, an implementation of model A from
Rzhetsky et al., 2009.

The implementation assumes that there are a total of 8 annotators. Each item is
annotated by a triplet of annotators, according to the loop design described
in Rzhetsky et al., 2009.

E.g., for 16 items the loop design looks like this (`A` indicates a label,
`*` indicates a missing value): ::

    A A A * * * * *
    A A A * * * * *
    * A A A * * * *
    * A A A * * * *
    * * A A A * * *
    * * A A A * * *
    * * * A A A * *
    * * * A A A * *
    * * * * A A A *
    * * * * A A A *
    * * * * * A A A
    * * * * * A A A
    A * * * * * A A
    A * * * * * A A
    A A * * * * * A
    A A * * * * * A

**Reference**

* Rzhetsky A., Shatkay, H., and Wilbur, W.J. (2009). "How to get the most from
  your curation effort", PLoS Computational Biology, 5(5).
"""

from collections import defaultdict
import numpy as np
import scipy.optimize
import scipy.stats
from traits.trait_numeric import Array
from traits.trait_types import Int
from pyanno.abstract_model import AbstractModel
from pyanno.sampling import optimize_step_size, sample_distribution
from pyanno.util import (compute_counts, random_categorical,
                         labels_frequency, MISSING_VALUE, SMALLEST_FLOAT,
                         ninf_to_num )

import logging
logger = logging.getLogger(__name__)


_compatibility_tables_cache = {}
def _compatibility_tables(nclasses):
    """Return a map from agreement indices to annotation patterns.

    The agreement indices are defined as in Table 3 of
    Rzhetsky et al., 2009, supplementary material:
    0=aaa, 1=aaA, 2=aAa, 3=Aaa, 4=Aa@

    The dictionary maps an agreement index to an array of annotations
    compatible with the corresponding agreement pattern.
    For example, for nclasses=3 and index=1 the array contains these
    annotations:
    0 0 1
    0 0 2
    1 1 0
    1 1 2
    2 2 0
    2 2 1
    """

    if nclasses not in _compatibility_tables_cache:
        compatibility = defaultdict(list)

        # aaa
        for psi1 in range(nclasses):
            compatibility[0].append([psi1, psi1, psi1])

        # aaA, aAa, Aaa
        for psi1 in range(nclasses):
            for psi2 in range(nclasses):
                if psi2 == psi1: continue
                compatibility[1].append([psi1, psi1, psi2])
                compatibility[2].append([psi1, psi2, psi1])
                compatibility[3].append([psi2, psi1, psi1])

        # Aa@
        for psi1 in range(nclasses):
            for psi2 in range(nclasses):
                for psi3 in range(nclasses):
                    if psi1==psi2 or psi2==psi3 or psi1==psi3: continue
                    compatibility[4].append([psi1, psi2, psi3])

        for idx in range(5):
            compatibility[idx] = np.array(compatibility[idx], dtype=int)

        _compatibility_tables_cache[nclasses] = compatibility

    return _compatibility_tables_cache[nclasses]


def _triplet_to_counts_index(triplet, nclasses):
    """Map annotator triplets to data indices for the counts format.
    """
    return (triplet * np.array([nclasses**2, nclasses, 1])).sum(1)

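# Worked example (illustrative, not part of the original module): with
# nclasses=3 a triplet is read as a base-3 number, so the single row
# [[1, 0, 2]] maps to counts index 1*9 + 0*3 + 2 = 11:
#
#     >>> _triplet_to_counts_index(np.array([[1, 0, 2]]), 3)
#     array([11])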

class ModelA(AbstractModel):
    """Implementation of Model A from (Rzhetsky et al., 2009).

    The model defines a probability distribution over data annotations
    in which each item is annotated by three users. The distribution is
    described by a three-step generative model:

    1. First, the model independently generates correctness values for the
       triplet of annotators (e.g., CCI, where C=correct, I=incorrect)

    2. Second, the model generates an agreement pattern compatible with
       the correctness values (e.g., CII is compatible with the agreement
       patterns 'abb' and 'abc', where different letters correspond to
       different annotations)

    3. Finally, the model generates actual observations compatible with
       the agreement patterns

    The model has two main sets of parameters:

    - theta[j] is the probability that annotator j is correct

    - omega[k] is the probability of observing an annotation of class `k`
      over all items and annotators

    At the moment the implementation of the model assumes 1) a total of 8
    annotators, and 2) that each item is annotated by exactly 3 annotators.

    See the documentation for a more detailed description of the model.

    **Reference**

    * Rzhetsky A., Shatkay, H., and Wilbur, W.J. (2009). "How to get the
      most from your curation effort", PLoS Computational Biology, 5(5).
    """

    ######## Model traits

    # number of label classes
    nclasses = Int

    # number of annotators
    nannotators = Int(8)

    # number of annotators rating each item in the loop design
    nannotators_per_item = Int(3)

    #### Model parameters

    # theta[j] is the probability that annotator j is correct
    theta = Array(dtype=float, shape=(None,))

    # omega[k] is the probability of observing label class k
    omega = Array(dtype=float, shape=(None,))

    def __init__(self, nclasses, theta, omega, **traits):
        """Create an instance of ModelA.

        Arguments
        ---------
        nclasses : int
            Number of possible annotation classes

        theta : ndarray, shape = (n_annotators, )
            theta[j] is the probability of annotator j being correct

        omega : ndarray, shape = (n_classes, )
            omega[k] is the probability of observing a label of class k
        """
        self.nclasses = nclasses
        self.theta = theta
        self.omega = omega
        super(ModelA, self).__init__(**traits)


    ##### Model and data generation methods ###################################
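    # Overview of the generative methods below (comment added for clarity;
    # the notation T_ijk/A_ijk/V_ijk follows Rzhetsky et al., 2009):
    #   1. _generate_incorrectness draws correctness values T_ijk using theta
    #   2. _generate_agreement draws agreement patterns A_ijk using alpha
    #   3. _generate_annotations draws observed labels V_ijk using omega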
    @staticmethod
    def create_initial_state(nclasses, theta=None, omega=None):
        """Factory method to create a new model.

        It is often more convenient to use this factory method over the
        constructor, as one does not need to specify the initial model
        parameters.

        If not specified, the parameters theta are drawn from a uniform
        distribution between 0.6 and 0.95. The parameters omega are drawn
        from a Dirichlet distribution with parameters 2.0:

        :math:`\\theta_j \sim \mathrm{Uniform}(0.6, 0.95)`

        :math:`\omega_k \sim \mathrm{Dirichlet}(2.0)`

        Arguments
        ---------
        nclasses : int
            number of possible annotation classes

        theta : ndarray, shape = (n_annotators, )
            theta[j] is the probability of annotator j being correct

        omega : ndarray, shape = (n_classes, )
            omega[k] is the probability of observing a label of class k
        """
        if theta is None:
            nannotators = 8
            theta = ModelA._random_theta(nannotators)

        if omega is None:
            omega = ModelA._random_omega(nclasses)

        return ModelA(nclasses, theta, omega)

    @staticmethod
    def _random_theta(nannotators):
        return np.random.uniform(low=0.6, high=0.95,
                                 size=(nannotators,))


    @staticmethod
    def _random_omega(nclasses):
        beta = 2.*np.ones((nclasses,))
        return np.random.dirichlet(beta)

    def generate_annotations(self, nitems):
        """Generate random annotations from the model.

        The method samples random annotations from the probability
        distribution defined by the model parameters:

        1) generate correct/incorrect labels for the three annotators,
           according to the parameters `theta`

        2) generate agreement patterns (i.e., which annotator agrees with
           whom) given the correctness information and the parameters `alpha`

        3) generate the annotations given the agreement patterns and the
           parameters `omega`

        Note that, according to the model's definition, only three annotators
        per item return an annotation. Non-observed annotations have the
        standard value of :attr:`~pyanno.util.MISSING_VALUE`.

        Arguments
        ---------
        nitems : int
            number of annotations to draw from the model

        Returns
        -------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i
        """
        theta = self.theta
        nannotators = self.nannotators
        # cast to int: np.ceil returns a float, which cannot be used as an
        # array index
        nitems_per_loop = int(np.ceil(float(nitems) / nannotators))

        annotations = np.empty((nitems, nannotators), dtype=int)
        annotations.fill(MISSING_VALUE)

        # loop over annotator triplets (loop design)
        for j in range(nannotators):
            triplet_indices = np.arange(j, j+3) % self.nannotators
            start_idx = j*nitems_per_loop
            stop_idx = min(nitems, (j+1)*nitems_per_loop)
            nitems_this_loop = stop_idx - start_idx

            # -- step 1: generate correct / incorrect labels

            # parameters for this triplet
            theta_triplet = self.theta[triplet_indices]
            incorrect = self._generate_incorrectness(nitems_this_loop,
                                                     theta_triplet)

            # -- step 2: generate agreement patterns given correctness

            # convert boolean correctness into combination indices
            # (indices as in Table 3)
            agreement = self._generate_agreement(incorrect)

            # -- step 3: generate annotations
            annotations[start_idx:stop_idx, triplet_indices] = (
                self._generate_annotations(agreement)
            )

        return annotations

    def _generate_incorrectness(self, n, theta_triplet):
        _rnd = np.random.rand(n, self.nannotators_per_item)
        incorrect = _rnd >= theta_triplet
        return incorrect


    def _generate_agreement(self, incorrect):
        """Return indices of agreement pattern given correctness pattern.

        The indices returned correspond to agreement patterns as in
        Table 3: 0=aaa, 1=aaA, 2=aAa, 3=Aaa, 4=Aa@
        """

        # create tensor A_ijk
        # (cf. Table 3 in Rzhetsky et al., 2009, suppl. mat.)
        alpha = self._compute_alpha()
        agreement_tbl = np.array(
            [[1., 0., 0., 0., 0.],
             [0., 1., 0., 0., 0.],
             [0., 0., 1., 0., 0.],
             [0., 0., 0., 1., 0.],
             [0., 0., 0., alpha[0], 1.-alpha[0]],
             [0., 0., alpha[1], 0., 1.-alpha[1]],
             [0., alpha[2], 0., 0., 1.-alpha[2]],
             [alpha[3], alpha[4], alpha[5], alpha[6], 1.-alpha[3:].sum()]])

        # this array maps boolean correctness patterns (e.g., CCI) to
        # indices in the agreement tensor, `agreement_tbl`
        correctness_to_agreement_idx = np.array([0, 3, 2, 6, 1, 5, 4, 7])

        # convert correctness pattern to index in the A_ijk tensor
        correct_idx = correctness_to_agreement_idx[
            incorrect[:,0]*1 + incorrect[:,1]*2 + incorrect[:,2]*4]

        # the indices stored in `agreement` correspond to agreement patterns
        # as in Table 3: 0=aaa, 1=aaA, 2=aAa, 3=Aaa, 4=Aa@
        nitems_per_loop = incorrect.shape[0]
        agreement = np.empty((nitems_per_loop,), dtype=int)
        for i in range(nitems_per_loop):
            # generate agreement pattern according to A_ijk
            agreement[i] = random_categorical(
                agreement_tbl[correct_idx[i]], 1)

        return agreement


    def _generate_annotations(self, agreement):
        """Generate triplet annotations given agreement pattern."""
        nitems_per_loop = agreement.shape[0]
        omega = self.omega
        annotations = np.empty((nitems_per_loop, 3), dtype=int)

        for i in range(nitems_per_loop):
            # get all annotations compatible with the agreement pattern
            compatible = _compatibility_tables(self.nclasses)[agreement[i]]

            # compute probability of each possible annotation
            distr = omega[compatible].prod(1)
            distr /= distr.sum()

            # draw annotation
            compatible_idx = random_categorical(distr, 1)[0]
            annotations[i,:] = compatible[compatible_idx, :]
        return annotations
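
    # Worked example (illustrative, not part of the original module): for
    # agreement pattern 1 (aaA) with nclasses=3, the compatible triplets are
    # the six rows listed in _compatibility_tables, and a row such as
    # [0, 0, 1] is drawn with probability proportional to
    # omega[0] * omega[0] * omega[1], renormalized over the six candidates.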

    ##### Parameters estimation methods #######################################

    def mle(self, annotations, estimate_omega=True):
        """Computes maximum likelihood estimate (MLE) of parameters.

        Estimate the parameters :attr:`theta` and :attr:`omega` from a set of
        observed annotations using maximum likelihood estimation.

        Arguments
        ---------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i

        estimate_omega : bool
            If True, the parameters :attr:`omega` are estimated by the
            empirical class frequency. If False, :attr:`omega` is left
            unchanged.
        """
        self._raise_if_incompatible(annotations)

        def _wrap_lhood(params, counts):
            self.theta = params
            return - self._log_likelihood_counts(counts)

        self._parameter_estimation(_wrap_lhood, annotations,
                                   estimate_omega=estimate_omega)

    def map(self, annotations, estimate_omega=True):
        """Computes maximum a posteriori (MAP) estimate of parameters.

        Estimate the parameters :attr:`theta` and :attr:`omega` from a set of
        observed annotations using maximum a posteriori estimation.

        Arguments
        ---------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i

        estimate_omega : bool
            If True, the parameters :attr:`omega` are estimated by the
            empirical class frequency. If False, :attr:`omega` is left
            unchanged.
        """
        self._raise_if_incompatible(annotations)

        def _wrap_lhood(params, counts):
            self.theta = params
            return - (self._log_likelihood_counts(counts)
                      + self._log_prior())

        self._parameter_estimation(_wrap_lhood, annotations,
                                   estimate_omega=estimate_omega)

    def _parameter_estimation(self, objective, annotations,
                              estimate_omega=True):
        counts = compute_counts(annotations, self.nclasses)

        params_start, omega = self._random_initial_parameters(annotations,
                                                              estimate_omega)
        self.omega = omega

        logger.info('Start parameters optimization...')

        params_best = scipy.optimize.fmin(objective,
                                          params_start,
                                          args=(counts,),
                                          xtol=1e-4, ftol=1e-4,
                                          disp=False, maxiter=10000)

        logger.info('Parameters optimization finished')

        self.theta = params_best


    def _random_initial_parameters(self, annotations, estimate_omega):
        # TODO duplication w/ ModelBtLoopDesign
        if estimate_omega:
            # estimate omega from observed annotations
            omega = labels_frequency(annotations, self.nclasses)
        else:
            omega = self.omega

        theta = ModelA._random_theta(self.nannotators)
        return theta, omega


    ##### Model likelihood methods ############################################
    def log_likelihood(self, annotations):
        """Compute the log likelihood of a set of annotations given the model.

        Returns :math:`\log P(\mathbf{x} | \omega, \\theta)`,
        where :math:`\mathbf{x}` is the array of annotations.

        Arguments
        ---------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i

        Returns
        -------
        log_lhood : float
            log likelihood of `annotations`
        """
        self._raise_if_incompatible(annotations)

        counts = compute_counts(annotations, self.nclasses)
        return self._log_likelihood_counts(counts)


    # TODO code duplication with ModelBtLoopDesign -> refactor
    def _log_likelihood_counts(self, counts):
        """Compute the log likelihood of annotations given the model.

        This method assumes the data is in counts format.
        """

        # TODO: check if it's possible to replace these constraints with
        #   bounded optimization

        # check bounds of parameters (for likelihood optimization)
        if np.amin(self.theta) <= 0 or np.amax(self.theta) > 1:
            # return np.inf
            return SMALLEST_FLOAT

        # compute alpha and beta (they do not depend on theta)
        alpha = self._compute_alpha()
        beta = [None]*5
        pattern_to_indices = _compatibility_tables(self.nclasses)
        for pattern in range(5):
            indices = pattern_to_indices[pattern]
            beta[pattern] = self.omega[indices].prod(1)
            beta[pattern] /= beta[pattern].sum()

        llhood = 0.
        # loop over the 8 combinations of annotators
        for i in range(8):
            # extract the theta parameters for this triplet
            triplet_indices = np.arange(i, i+3) % self.nannotators
            triplet_indices.sort()
            theta_triplet = self.theta[triplet_indices]

            # compute the likelihood for the triplet
            llhood += self._log_likelihood_triplet(counts[:,i],
                                                   theta_triplet,
                                                   alpha, beta)

        return llhood


    def _log_likelihood_triplet(self, counts_triplet, theta_triplet,
                                alpha, beta):
        """Compute the log likelihood of data for one triplet of annotators.

        Input:
        counts_triplet -- count data for one combination of annotators
        theta_triplet -- theta parameters of the current triplet
        """

        nclasses = self.nclasses
        llhood = 0.

        # loop over all possible agreement patterns
        # 0=aaa, 1=aaA, 2=aAa, 3=Aaa, 4=Aa@
        pattern_to_indices = _compatibility_tables(nclasses)
        for pattern in range(5):
            # P( A_ijk | T_ijk ) * P( T_ijk ), or "alpha * theta triplet"
            prob = self._prob_a_and_t(pattern, theta_triplet, alpha)

            # P( V_ijk | A_ijk ) * P( A_ijk | T_ijk ) * P( T_ijk )
            # = P( V_ijk | A, T, model )
            prob *= beta[pattern]

            # P( V_ijk | model ) = sum over A and T of conditional probability
            indices = pattern_to_indices[pattern]
            count_indices = _triplet_to_counts_index(indices, nclasses)
            log_prob = ninf_to_num(np.log(prob))
            llhood += (counts_triplet[count_indices] * log_prob).sum()

        return llhood


    def _log_prior(self):
        """Compute log probability of prior on the theta parameters."""
        log_prob = scipy.stats.beta._logpdf(self.theta, 2., 1.).sum()
        if np.isneginf(log_prob):
            log_prob = SMALLEST_FLOAT
        return log_prob


    def _prob_a_and_t(self, pattern, theta_triplet, alpha):
        # TODO make more robust by taking logarithms earlier
        # TODO could be vectorized some more using the A_ijk tensor

        # 0=aaa, 1=aaA, 2=aAa, 3=Aaa, 4=Aa@

        # abbreviations
        thetat = theta_triplet
        not_thetat = (1.-theta_triplet)

        if pattern == 0:
            # aaa patterns
            prob = (thetat.prod()
                    + not_thetat.prod() * alpha[3])

        elif pattern == 1:
            # aaA patterns
            prob = (thetat[0] * thetat[1] * not_thetat[2]
                    + not_thetat[0] * not_thetat[1] * thetat[2] * alpha[2]
                    + not_thetat[0] * not_thetat[1] * not_thetat[2] * alpha[4])

        elif pattern == 2:
            # aAa patterns
            prob = (thetat[0] * not_thetat[1] * thetat[2]
                    + not_thetat[0] * thetat[1] * not_thetat[2] * alpha[1]
                    + not_thetat[0] * not_thetat[1] * not_thetat[2] * alpha[5])

        elif pattern == 3:
            # Aaa patterns
            prob = (not_thetat[0] * thetat[1] * thetat[2]
                    + thetat[0] * not_thetat[1] * not_thetat[2] * alpha[0]
                    + not_thetat[0] * not_thetat[1] * not_thetat[2] * alpha[6])

        elif pattern == 4:
            # Aa@ pattern
            prob = (not_thetat[0] * not_thetat[1] * not_thetat[2]
                      * (1. - alpha[3] - alpha[4] - alpha[5] - alpha[6])
                    + thetat[0] * not_thetat[1] * not_thetat[2] * (1. - alpha[0])
                    + not_thetat[0] * thetat[1] * not_thetat[2] * (1. - alpha[1])
                    + not_thetat[0] * not_thetat[1] * thetat[2] * (1. - alpha[2]))

        return prob
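
    # Sanity check (illustrative, not part of the original module): summing
    # _prob_a_and_t over the five agreement patterns enumerates every
    # correct/incorrect combination of the triplet exactly once, with the
    # alpha terms cancelling, so the total is 1:
    #
    #     sum(model._prob_a_and_t(p, theta_triplet, alpha) for p in range(5))
    #     # -> 1.0 (up to floating-point error)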

    ##### Sampling posterior over parameters ##################################

    def sample_posterior_over_accuracy(self, annotations, nsamples,
                                       burn_in_samples=100,
                                       thin_samples=5,
                                       target_rejection_rate=0.3,
                                       rejection_rate_tolerance=0.2,
                                       step_optimization_nsamples=500,
                                       adjust_step_every=100):
        """Return samples from posterior distribution over theta given data.

        Samples are drawn using a variant of a Metropolis-Hastings Markov
        Chain Monte Carlo (MCMC) algorithm. Sampling proceeds in two phases:

        1) *step size estimation phase*: first, the step size in the MCMC
           algorithm is adjusted to achieve a given rejection rate.

        2) *sampling phase*: second, samples are collected using the step
           size from phase 1.

        Arguments
        ---------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i

        nsamples : int
            number of samples to draw from the posterior

        burn_in_samples : int
            Discard the first `burn_in_samples` during the initial burn-in
            phase, where the Monte Carlo chain converges to the posterior

        thin_samples : int
            Only return one every `thin_samples` samples in order to reduce
            the auto-correlation in the sampling chain. This is called
            "thinning" in MCMC parlance.

        target_rejection_rate : float
            target rejection rate for the step size estimation phase

        rejection_rate_tolerance : float
            the step size estimation phase is ended when the rejection rate
            for all parameters is within `rejection_rate_tolerance` from
            `target_rejection_rate`

        step_optimization_nsamples : int
            number of samples to draw in the step size estimation phase

        adjust_step_every : int
            number of samples after which the step size is adjusted during
            the step size estimation phase

        Returns
        -------
        samples : ndarray, shape = (n_samples, n_annotators)
            samples[i,:] is one sample from the posterior distribution over
            the parameters `theta`
        """

        self._raise_if_incompatible(annotations)
        nsamples = self._compute_total_nsamples(nsamples,
                                                burn_in_samples,
                                                thin_samples)

        # optimize step size
        counts = compute_counts(annotations, self.nclasses)

        # wrap log likelihood function to give it to optimize_step_size and
        # sample_distribution
        _llhood_counts = self._log_likelihood_counts
        _log_prior = self._log_prior
        def _wrap_llhood(params, counts):
            self.theta = params
            return _llhood_counts(counts) + _log_prior()

        # TODO this save-reset is rather ugly, refactor: create copy of
        #   model and sample over it
        # save internal parameters to reset at the end of sampling
        save_params = self.theta
        try:
            # compute optimal step size for given target rejection rate
            params_start = self.theta.copy()
            params_upper = np.ones((self.nannotators,))
            params_lower = np.zeros((self.nannotators,))
            step = optimize_step_size(_wrap_llhood, params_start, counts,
                                      params_lower, params_upper,
                                      step_optimization_nsamples,
                                      adjust_step_every,
                                      target_rejection_rate,
                                      rejection_rate_tolerance)

            # draw samples from posterior distribution over theta
            samples = sample_distribution(_wrap_llhood, params_start, counts,
                                          step, nsamples,
                                          params_lower, params_upper)
            return self._post_process_samples(samples, burn_in_samples,
                                              thin_samples)
        finally:
            # reset parameters
            self.theta = save_params


    ##### Posterior distributions #############################################

    # TODO ideally, one would infer the posterior over correctness (T_ijk)
    #   first, and then return the probability of each value

#    def infer_correctness(self, annotations):
#        """Infer posterior distribution over correctness patterns."""
#        nitems = annotations.shape[0]
#        nclasses = self.nclasses
#
#        posterior = np.zeros((nitems, self.annotators_per_item**2))
#        alpha = self._compute_alpha()
#        for i, row in enumerate(annotations):
#            valid_idx = np.where(row >= 0)
#            vijk = row[valid_idx]
#            tijk = self.theta[valid_idx]
#            p = self._compute_posterior_T_triplet(vijk, tijk, alpha)
#            posteriors[i, :] = p
#
#        return posteriors
#
#
#    def _compute_posterior_T_triplet(self, v, t, alpha):
#        # switch over agreement pattern
#        # 0=aaa, 1=aaA, 2=aAa, 3=Aaa, 4=Aa@
#        if v[0] == v[1] == v[2]:
#            # aaa pattern
#            pass

    def infer_labels(self, annotations):
        """Infer posterior distribution over label classes.

        Compute the posterior distribution over label classes given observed
        annotations, :math:`P( \mathbf{y} | \mathbf{x}, \\theta, \omega)`.

        Arguments
        ---------
        annotations : ndarray, shape = (n_items, n_annotators)
            annotations[i,j] is the annotation of annotator j for item i

        Returns
        -------
        posterior : ndarray, shape = (n_items, n_classes)
            posterior[i,k] is the posterior probability of class k given the
            annotation observed in item i.
        """

        self._raise_if_incompatible(annotations)

        nitems = annotations.shape[0]
        nclasses = self.nclasses

        posteriors = np.zeros((nitems, nclasses))
        alpha = self._compute_alpha()

        for i, row in enumerate(annotations):
            ind = np.where(row >= 0)
            vijk = row[ind]
            tijk = self.theta[ind].copy()
            p = self._compute_posterior_triplet(vijk, tijk, alpha)
            posteriors[i, :] = p

        return posteriors

    def _compute_posterior_triplet(self, vijk, tijk, alpha):
        nclasses = self.nclasses
        posteriors = np.zeros(nclasses, float)

        #-----------------------------------------------
        # aaa
        if vijk[0] == vijk[1] and vijk[1] == vijk[2]:
            x1 = tijk[0] * tijk[1] * tijk[2]
            x2 = (1 - tijk[0]) * (1 - tijk[1]) * (1 - tijk[2])
            p1 = x1 / (x1 + alpha[3] * x2)
            p2 = (1 - p1) / (nclasses - 1)
            for j in range(nclasses):
                if vijk[0] == j:
                    posteriors[j] = p1
                else:
                    posteriors[j] = p2

        #-----------------------------------------------
        # aaA
        elif vijk[0] == vijk[1] and vijk[1] != vijk[2]:
            x1 = tijk[0] * tijk[1] * (1 - tijk[2])
            x2 = (1 - tijk[0]) * (1 - tijk[1]) * tijk[2]
            x3 = (1 - tijk[0]) * (1 - tijk[1]) * (1 - tijk[2])

            # a is correct
            p1 = x1 / (x1 + alpha[2] * x2 + alpha[4] * x3)
            # A is correct
            p2 = (alpha[2] * x2) / (x1 + alpha[2] * x2 + alpha[4] * x3)
            # neither
            p3 = (1 - p1 - p2) / (nclasses - 2)

            for j in range(nclasses):
                if vijk[0] == j:
                    posteriors[j] = p1
                elif vijk[2] == j:
                    posteriors[j] = p2
                else:
                    posteriors[j] = p3

        #-----------------------------------------------
        # aAa
        elif vijk[0] == vijk[2] and vijk[1] != vijk[2]:
            x1 = tijk[0] * (1 - tijk[1]) * tijk[2]
            x2 = (1 - tijk[0]) * tijk[1] * (1 - tijk[2])
            x3 = (1 - tijk[0]) * (1 - tijk[1]) * (1 - tijk[2])

            # a is correct
            p1 = x1 / (x1 + alpha[1] * x2 + alpha[5] * x3)
            # A is correct
            p2 = (alpha[1] * x2) / (x1 + alpha[1] * x2 + alpha[5] * x3)
            # neither
            p3 = (1 - p1 - p2) / (nclasses - 2)

            for j in range(nclasses):
                if vijk[0] == j:
                    posteriors[j] = p1
                elif vijk[1] == j:
                    posteriors[j] = p2
                else:
                    posteriors[j] = p3

        #-----------------------------------------------
        # Aaa
        elif vijk[1] == vijk[2] and vijk[0] != vijk[2]:
            x1 = (1 - tijk[0]) * tijk[1] * tijk[2]
            x2 = tijk[0] * (1 - tijk[1]) * (1 - tijk[2])
            x3 = (1 - tijk[0]) * (1 - tijk[1]) * (1 - tijk[2])

            # a is correct
            p1 = x1 / (x1 + alpha[0] * x2 + alpha[6] * x3)
            # A is correct
            p2 = (alpha[0] * x2) / (x1 + alpha[0] * x2 + alpha[6] * x3)
            # neither
            p3 = (1 - p1 - p2) / (nclasses - 2)

            for j in range(nclasses):
                if vijk[0] == j:
                    posteriors[j] = p2
                elif vijk[2] == j:
                    posteriors[j] = p1
                else:
                    posteriors[j] = p3

        #-----------------------------------------------
        # aAb
        elif vijk[0] != vijk[1] and vijk[1] != vijk[2]:
            x1 = tijk[0] * (1 - tijk[1]) * (1 - tijk[2])
            x2 = (1 - tijk[0]) * tijk[1] * (1 - tijk[2])
            x3 = (1 - tijk[0]) * (1 - tijk[1]) * tijk[2]
            x4 = (1 - tijk[0]) * (1 - tijk[1]) * (1 - tijk[2])

            summa1 = 1 - alpha[3] - alpha[4] - alpha[5] - alpha[6]
            summa2 = ((1 - alpha[0]) * x1 +
                      (1 - alpha[1]) * x2 +
                      (1 - alpha[2]) * x3 +
                      summa1 * x4)

            # a is correct
            p1 = (1 - alpha[0]) * x1 / summa2
            # A is correct
            p2 = (1 - alpha[1]) * x2 / summa2
            # b is correct
            p3 = (1 - alpha[2]) * x3 / summa2
            # (a, A, b) are all incorrect
            p4 = (summa1 * x4 / summa2) / (nclasses - 3)

            for j in range(nclasses):
                if vijk[0] == j:
                    posteriors[j] = p1
                elif vijk[1] == j:
                    posteriors[j] = p2
                elif vijk[2] == j:
                    posteriors[j] = p3
                else:
                    posteriors[j] = p4

        # check posteriors: non-negative, sum to 1
        assert np.abs(posteriors.sum() - 1.) < 1e-6
        assert posteriors.min() >= 0.

        return posteriors


    def _compute_alpha(self):
        """Compute the parameters `alpha` given the parameters `omega`.

        Cf. Table 4 in Rzhetsky et al., 2009.
        """

        omega = self.omega
        nclasses = self.nclasses
        alpha = np.zeros((7,))

        # ------ alpha_1,2,3

        # sum over all doublets
        outer_omega = np.outer(omega, omega)
        sum_wi_wk = outer_omega.sum()

        # sum over all omega_i * omega_j, where i!=k and j!=k
        sum_wi_wj_not_k = np.zeros((nclasses,))
        # sum over all omega_i ** 2, where i!=k
        sum_wi2_not_k = np.zeros((nclasses,))
        for k in range(nclasses):
            sum_wi_wj_not_k[k] = (sum_wi_wk
                                  - 2*outer_omega[:,k].sum()
                                  + outer_omega[k,k])
            sum_wi2_not_k[k] = (outer_omega.diagonal().sum()
                                - outer_omega[k,k])

        a1 = (omega * sum_wi2_not_k / sum_wi_wj_not_k).sum()
        alpha[0:3] = a1

        # ------ alpha_4,5,6,7

        # sum over all triplets
        outer_omega3 = (outer_omega[:,:,np.newaxis]
                        * omega[np.newaxis,np.newaxis,:])
        sum_wi_wj_wl = outer_omega3.sum()

        # sum over omega_i * omega_j * omega_l, where i!=k and j!=k and l!=k
        sum_wi_wj_wl_not_k = np.zeros((nclasses,))
        for k in range(nclasses):
            sum_wi_wj_wl_not_k[k] = (sum_wi_wj_wl
                                     - 3.*outer_omega3[:,:,k].sum()
                                     + 3.*outer_omega3[:,k,k].sum()
                                     - outer_omega3[k,k,k])

        omega3 = omega**3
        sum_wi3_not_k = omega3.sum() - omega3

        a4 = (omega * sum_wi3_not_k / sum_wi_wj_wl_not_k).sum()
        alpha[3] = a4

        a5 = 0
        for i in range(nclasses):
            tmp = 0
            for j in range(nclasses):
                for k in range(nclasses):
                    if j != i and k != i and j != k:
                        tmp += omega[k] * omega[j] ** 2
            a5 += omega[i] * tmp / sum_wi_wj_wl_not_k[i]
        alpha[4:7] = a5

        return alpha

    ##### Verify input #########################################################

    def are_annotations_compatible(self, annotations):
        """Check if the annotations are compatible with the model's
        parameters.
        """

        if not super(ModelA, self).are_annotations_compatible(annotations):
            return False

        masked_annotations = np.ma.masked_equal(annotations, MISSING_VALUE)

        # exactly 3 annotations per row
        nvalid = (~masked_annotations.mask).sum(1)
        if not np.all(nvalid == self.nannotators_per_item):
            return False

        return True
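

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module); it
# exercises only the public API defined above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # create a model over 4 label classes with randomly drawn parameters
    model = ModelA.create_initial_state(4)

    # simulate 800 items annotated according to the 8-annotator loop design
    annotations = model.generate_annotations(800)

    # re-estimate theta (and omega, from class frequencies) by ML
    model.mle(annotations)
    print('estimated theta:', model.theta)

    # posterior probability of each label class for every item
    posterior = model.infer_labels(annotations)
    print('posterior shape:', posterior.shape)

    # draw a few posterior samples over theta (MCMC; may take a while)
    samples = model.sample_posterior_over_accuracy(annotations, 200)
    print('posterior samples shape:', samples.shape)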
