# Source code for pyanno.measures.helpers

# Copyright (c) 2011, Enthought, Ltd.
# Author: Pietro Berkes <pberkes@enthought.com>
# License: Modified BSD license (2-clause)

from __future__ import division

import numpy as np
from pyanno.util import labels_count, labels_frequency, is_valid


def pairwise_matrix(pairwise_statistic, annotations, *args, **kwargs):
    """Evaluate a two-annotator statistic on every ordered pair of annotators.

    The statistic is called once for each pair of columns `(i, j)` of
    `annotations`, including the pairs with `i == j`, and the scalar it
    returns is stored at position `[i, j]` of the output matrix.

    Example ::

        >>> from pyanno.measures import pairwise_matrix, cohens_kappa
        >>> stat_matrix = pairwise_matrix(cohens_kappa, annotations, nclasses=4)

    Arguments
    ---------
    pairwise_statistic : function
        Callable taking two 1D arrays of annotations as its first two
        arguments and returning a single scalar.

    annotations : ndarray, shape = (n_items, n_annotators)
        Annotations in pyanno format; each column holds one annotator.

    args : any
        Extra positional arguments forwarded to `pairwise_statistic`.

    kwargs : any
        Extra keyword arguments forwarded to `pairwise_statistic`.

    Returns
    -------
    stat_matrix : ndarray, shape = (n_annotators, n_annotators)
        `stat_matrix[i,j]` is the value of `pairwise_statistic` applied to
        the annotations of annotators `i` and `j`
    """
    n_cols = annotations.shape[1]
    stat_matrix = np.empty((n_cols, n_cols), dtype=float)

    # np.ndindex walks the (row, col) index pairs in row-major order, i.e.
    # the same order as two nested `range` loops.
    for row, col in np.ndindex(n_cols, n_cols):
        stat_matrix[row, col] = pairwise_statistic(
            annotations[:, row], annotations[:, col], *args, **kwargs
        )

    return stat_matrix
def confusion_matrix(annotations1, annotations2, nclasses):
    """Compute confusion matrix from pairs of annotations.

    **References**

    * `Wikipedia entry <http://en.wikipedia.org/wiki/Confusion_matrix>`_

    Arguments
    ---------
    annotations1 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    annotations2 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int or None
        Number of annotation classes. If None, `nclasses` is inferred from
        the values in the annotations as the largest label plus one.

    Returns
    -------
    conf_mat : ndarray, shape = (n_classes, n_classes)
        Confusion matrix; conf_mat[i,j] = number of observations that was
        annotated as category `i` by annotator 1 and as `j` by annotator 2.
        Items where either label lies outside `0 .. nclasses-1` match no
        cell and are therefore not counted.
    """
    if nclasses is None:
        # Labels are expected to be 0-based integers, so the class count is
        # the largest label seen plus one.
        # NOTE(review): assumes the missing-value marker is smaller than any
        # valid label -- confirm against pyanno.util.MISSING_VALUE.
        nclasses = int(max(np.amax(annotations1), np.amax(annotations2))) + 1

    conf_mat = np.empty((nclasses, nclasses), dtype=float)
    for i in range(nclasses):
        # The mask for annotator 1 does not depend on j: compute it once.
        first_is_i = annotations1 == i
        for j in range(nclasses):
            conf_mat[i, j] = np.sum(
                np.logical_and(first_is_i, annotations2 == j)
            )

    return conf_mat
def coincidence_matrix(annotations, nclasses):
    """Build coincidence matrix.

    The element c,k of the coincidence matrix accumulates, over all items
    with at least two valid annotations, the number of ordered c-k pairs of
    annotations within the item divided by (number of valid annotations in
    the item - 1).

    **Reference**

    * `Wikipedia entry
      <http://en.wikipedia.org/wiki/Krippendorff%27s_Alpha#Coincidence_matrices>`_

    Arguments
    ---------
    annotations : ndarray, shape = (n_items, n_annotators)
        Array of annotations for multiple annotators. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int or None
        Number of annotation classes. If None, `nclasses` is inferred from
        the values in the annotations as the largest label plus one.

    Returns
    -------
    coinc_mat : ndarray, shape = (n_classes, n_classes)
        Coincidence matrix
    """
    if nclasses is None:
        # Infer from the full data set, before unpairable items are dropped.
        # NOTE(review): assumes the missing-value marker is smaller than any
        # valid label -- confirm against pyanno.util.MISSING_VALUE.
        nclasses = int(np.amax(annotations)) + 1

    # Number of valid annotations per item (row).
    nannotations = is_valid(annotations).sum(1).astype(float)

    # An item with fewer than two annotations contains no pair: drop it.
    # This also keeps the (nannotations - 1) divisor below strictly positive.
    valid = nannotations > 1
    nannotations = nannotations[valid]
    annotations = annotations[valid, :]

    # nc_in_row[u, c] = number of annotations of class c for item u.
    nc_in_row = np.empty((nannotations.shape[0], nclasses), dtype=int)
    for c in range(nclasses):
        nc_in_row[:, c] = (annotations == c).sum(1)

    # Per-item weight denominator, shared by every matrix cell.
    pair_norm = nannotations - 1.

    coincidences = np.empty((nclasses, nclasses), dtype=float)
    for c in range(nclasses):
        for k in range(nclasses):
            if c == k:
                # Ordered pairs of distinct annotations with the same class.
                nck_pairs = nc_in_row[:, c] * (nc_in_row[:, c] - 1)
            else:
                nck_pairs = nc_in_row[:, c] * nc_in_row[:, k]
            coincidences[c, k] = (nck_pairs / pair_norm).sum()

    return coincidences
def chance_adjusted_agreement(observed_agreement, chance_agreement):
    """Rescale an observed agreement by the agreement expected by chance.

    Computes

        (observed_agreement - chance_agreement) / (1.0 - chance_agreement)

    Arguments
    ---------
    observed_agreement : float
        Agreement computed from the data

    chance_agreement : float
        Agreement expected by chance given the assumptions of the statistics

    Return
    ------
    result : float
        Chance adjusted agreement value
    """
    # How far the observation lies above the chance level ...
    excess_over_chance = observed_agreement - chance_agreement
    # ... relative to the largest excess that is attainable.
    max_excess = 1. - chance_agreement
    return excess_over_chance / max_excess
def observed_agreement_frequency(annotations1, annotations2, nclasses):
    """Per-class observed frequency of agreement between two annotators.

    The frequencies are the diagonal of the confusion matrix of the two
    annotators divided by the sum of all its entries, so only items counted
    in the confusion matrix contribute. A category that is never agreed upon
    has frequency 0.0 . If the confusion matrix is entirely zero, every
    entry of the result is NaN.

    Arguments
    ---------
    annotations1 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    annotations2 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int
        Number of annotation classes.

    Return
    ------
    result : ndarray, shape = (n_classes, )
        Observed agreement frequency for each class
    """
    counts = confusion_matrix(annotations1, annotations2, nclasses)
    n_counted = counts.sum()

    if n_counted == 0:
        # Nothing was counted: the frequency is undefined for every class.
        return np.full((nclasses,), np.nan, dtype=float)

    return counts.diagonal() / n_counted
def chance_agreement_same_frequency(annotations1, annotations2, nclasses):
    """Expected frequency of agreement by random annotations.

    Assumes that the annotators draw random annotations with the same
    frequency as the combined observed annotations: the label counts of the
    two annotators (as given by :func:`pyanno.util.labels_count`) are added,
    normalized by their total, and squared element-wise.

    Arguments
    ---------
    annotations1 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    annotations2 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int
        Number of annotation classes.

    Return
    ------
    result : ndarray
        Chance agreement value for each class (presumably of shape
        (n_classes, ), following the output of `labels_count`)
    """
    pooled_counts = (
        labels_count(annotations1, nclasses)
        + labels_count(annotations2, nclasses)
    )
    pooled_frequency = pooled_counts / pooled_counts.sum()
    return pooled_frequency ** 2.
def chance_agreement_different_frequency(annotations1, annotations2, nclasses):
    """Expected frequency of agreement by random annotations.

    Assumes that the annotators draw annotations at random with different
    but constant frequencies: the label frequencies of each annotator (as
    given by :func:`pyanno.util.labels_frequency`) are multiplied
    element-wise.

    Arguments
    ---------
    annotations1 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    annotations2 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int
        Number of annotation classes.

    Return
    ------
    result : ndarray
        Chance agreement value for each class (presumably of shape
        (n_classes, ), following the output of `labels_frequency`)
    """
    first_frequency = labels_frequency(annotations1, nclasses)
    second_frequency = labels_frequency(annotations2, nclasses)
    return first_frequency * second_frequency
def compute_nclasses(*annotations):
    """Infer the number of label classes from the data.

    The result is the largest value found in any of the given annotation
    arrays, plus one (labels are taken to be 0-based integers).

    Arguments
    ---------
    annotations : one or more ndarrays
        Arrays of annotations; each must be non-empty.

    Returns
    -------
    nclasses : int
        Largest label over all arrays, plus one.
    """
    # Build a real list of per-array maxima: on Python 3 `map` returns a lazy
    # iterator, which NumPy wraps as a 0-d object array instead of reducing
    # over its elements.
    max_ = np.amax([np.amax(anno) for anno in annotations])
    return max_ + 1
def all_invalid(*annotations):
    """Return True if all annotations are invalid.

    Every argument is checked with :func:`pyanno.util.is_valid`; the result
    is False as soon as one argument contains at least one valid entry, and
    True otherwise (including when no argument is given).
    """
    # `any` stops at the first array holding a valid entry, so later arrays
    # are not inspected once the answer is known.
    return not any(np.any(is_valid(anno)) for anno in annotations)

Table Of Contents