Source code for pyanno.annotations

# Copyright (c) 2011, Enthought, Ltd.
# Author: Pietro Berkes <pberkes@enthought.com>
# License: Modified BSD license (2-clause)

"""Defines objects to create and manipulate raw annotations."""

from traits.has_traits import HasStrictTraits, cached_property
from traits.trait_numeric import Array
from traits.trait_types import Str, List, Int
from traits.traits import Property

import numpy as np
from pyanno.util import MISSING_VALUE, PyannoValueError


def _robust_isnan(x):
    """Return ``np.isnan(x)``, falling back to False for unsupported types.

    ``np.isnan`` is only defined for numerical input. For anything else
    (e.g. strings or ``None``) it raises instead of returning False: older
    NumPy releases raise ``NotImplementedError``, more recent ones raise
    ``TypeError``. Both are treated here as "not a NaN".

    Arguments
    ---------
    x : object
        Value to test.

    Returns
    -------
    The result of ``np.isnan(x)`` if it can be computed, False otherwise.
    """
    try:
        return np.isnan(x)
    except (NotImplementedError, TypeError):
        # non-numerical value: by definition it cannot be a NaN
        return False


def _is_nan_in_list(lst):
    """Return whether at least one element of `lst` is a NaN.

    Each element is tested with `_robust_isnan`, so that non-numerical
    elements are accepted as input.
    """
    nan_flags = list(map(_robust_isnan, lst))
    return np.any(nan_flags)


class AnnotationsContainer(HasStrictTraits):
    """Translate from general annotations files and arrays to pyAnno's format.

    This class exposes a few methods to import data from files and arrays, and
    converts them to pyAnno's format:

    * annotations are 2D integer arrays; rows index items, and columns
      annotators

    * label classes are numbered 0 to :attr:`nclasses`-1 . The attribute
      :attr:`labels` defines a mapping from label tokens to label classes

    * missing values are defined as :attr:`pyanno.util.MISSING_VALUE`. The
      attribute :attr:`missing_values` contains the missing values tokens
      found in the original, raw data

    The converted data can be accessed through the :attr:`annotations`
    property.

    The `AnnotationsContainer` is also used as the format to store annotations
    in :class:`~pyanno.database.PyannoDatabase` objects.
    """

    DEFAULT_MISSING_VALUES_STR = ['-1', 'NA', 'None', '*']
    DEFAULT_MISSING_VALUES_NUM = [-1, np.nan, None]
    DEFAULT_MISSING_VALUES_ALL = (DEFAULT_MISSING_VALUES_STR +
                                  DEFAULT_MISSING_VALUES_NUM)

    #: raw annotations, as they are imported from file or array
    raw_annotations = List(List)

    #: name of file or array from which the annotations were imported
    name = Str

    #: list of all labels found in file/array
    labels = List

    #: labels corresponding to a missing value
    missing_values = List

    #: number of classes found in the annotations
    nclasses = Property(Int, depends_on='labels')
    def _get_nclasses(self):
        return len(self.labels)

    #: number of annotators
    nannotators = Property(Int, depends_on='raw_annotations')
    def _get_nannotators(self):
        # an empty container has no first row to measure
        if not self.raw_annotations:
            return 0
        return len(self.raw_annotations[0])

    #: number of annotations
    nitems = Property(Int, depends_on='raw_annotations')
    def _get_nitems(self):
        return len(self.raw_annotations)

    #: annotations in pyAnno format
    annotations = Property(Array, depends_on='raw_annotations')
    @cached_property
    def _get_annotations(self):
        """Convert the raw annotations to a 2D integer array.

        Labels are replaced by their index in :attr:`labels`, and missing
        values by `MISSING_VALUE`.
        """
        nitems, nannotators = len(self.raw_annotations), self.nannotators
        anno = np.empty((nitems, nannotators), dtype=int)

        # build map from labels and missing values to annotation values
        raw2val = dict(zip(self.labels, range(self.nclasses)))
        raw2val.update([(mv, MISSING_VALUE) for mv in self.missing_values])

        # translate
        nan_in_missing_values = _is_nan_in_list(self.missing_values)
        for i, row in enumerate(self.raw_annotations):
            for j, lbl in enumerate(row):
                if nan_in_missing_values and _robust_isnan(lbl):
                    # workaround for the fact that np.nan cannot be used as
                    # the key to a dictionary, since np.nan != np.nan
                    anno[i, j] = MISSING_VALUE
                else:
                    anno[i, j] = raw2val[lbl]

        return anno

    @staticmethod
    def _from_generator(rows_generator, missing_values, name=''):
        """Build an annotations container from an iterable of rows.

        Arguments
        ---------
        rows_generator : iterable
            Yields one list of raw labels per item

        missing_values : list
            List of labels that are considered missing values

        name : string
            Name of the annotations

        Raises
        ------
        PyannoValueError
            If the rows do not all have the same number of entries
        """
        missing_set = set(missing_values)
        labels_set = set()

        raw_annotations = []
        nannotators = None
        for n, row in enumerate(rows_generator):

            # verify that number of lines is consistent in the whole file
            if nannotators is None:
                nannotators = len(row)
            else:
                if len(row) != nannotators:
                    raise PyannoValueError(
                        'File has inconsistent number of entries '
                        'on separate lines (line {})'.format(n))

            raw_annotations.append(row)
            labels_set.update(row)

        # NaNs are separated from the other labels explicitly: since
        # np.nan != np.nan, set operations do not reliably match them (a set
        # may even hold several distinct NaN objects), and sorting a list
        # that contains NaN does not put it in a predictable position
        found_nan = False
        regular_labels = set()
        for lbl in labels_set:
            if _robust_isnan(lbl):
                found_nan = True
            else:
                regular_labels.add(lbl)

        # remove missing values from set of labels
        all_labels = sorted(regular_labels - missing_set)
        missing_values = sorted(missing_set & regular_labels)

        # a NaN found in the data is always treated as a missing value
        if found_nan:
            missing_values.insert(0, np.nan)

        # create annotations object
        anno = AnnotationsContainer(
            raw_annotations = raw_annotations,
            labels = all_labels,
            missing_values = missing_values,
            name = name
        )

        return anno

    @staticmethod
    def _from_file_object(fobj, missing_values=None, name=''):
        """Load annotations from a file-like object.

        Useful for testing, as it can be called using a StringIO object.
        """

        if missing_values is None:
            missing_values = AnnotationsContainer.DEFAULT_MISSING_VALUES_STR

        # generator for rows of file-like object
        def file_row_generator():
            for line in fobj.readlines():
                # treat commas as separators and split in individual tokens
                labels = line.replace(',', ' ').split()

                # ignore lines without any token (empty, or separators only)
                if not labels:
                    continue

                yield labels

        return AnnotationsContainer._from_generator(file_row_generator(),
                                                    missing_values,
                                                    name=name)

    @staticmethod
    def from_file(filename, missing_values=None):
        """Load annotations from a file.

        The file is a text file with columns separated by spaces and/or
        commas, and rows on different lines.

        Arguments
        ---------
        filename : string
            File name

        missing_values : list
            List of labels that are considered missing values.
            Default is :attr:`DEFAULT_MISSING_VALUES_STR`
        """

        if missing_values is None:
            missing_values = AnnotationsContainer.DEFAULT_MISSING_VALUES_STR

        with open(filename) as fh:
            anno = AnnotationsContainer._from_file_object(
                fh,
                missing_values=missing_values,
                name=filename)

        return anno

    @staticmethod
    def from_array(x, missing_values=None, name=''):
        """Create an annotations object from an array or list-of-lists.

        Arguments
        ---------
        x : ndarray or list-of-lists
            Array or list-of-lists containing numerical or string annotations

        missing_values : list
            List of values that are considered missing values.
            Default is :attr:`DEFAULT_MISSING_VALUES_ALL`

        name : string
            Name of the annotations (for user interaction and used as key in
            databases).
        """

        if missing_values is None:
            missing_values = AnnotationsContainer.DEFAULT_MISSING_VALUES_ALL

        # generator for array objects
        def array_rows_generator():
            for row in x:
                yield list(row)

        return AnnotationsContainer._from_generator(array_rows_generator(),
                                                    missing_values, name=name)

    def save_to(self, filename, set_name=False):
        """Save raw annotations to file.

        Each row of :attr:`raw_annotations` is written on its own line, with
        entries separated by a single space.

        Arguments
        ---------
        filename : string
            File name

        set_name : bool
            Set the :attr:`name` of the annotation container to the file name
        """
        if set_name:
            self.name = filename

        with open(filename, 'w') as f:
            f.writelines(
                (' '.join(map(str, row)) + '\n'
                 for row in self.raw_annotations)
            )


def load_annotations(filename, missing_values=None):
    """Load annotations from file.

    The file is a text file with columns separated by spaces and/or
    commas, and rows on different lines. The file is read through
    :meth:`AnnotationsContainer.from_file`, and the `annotations` property
    of the resulting container is returned.

    Arguments
    ---------
    filename : string
        File name

    missing_values : list
        List of labels that are considered missing values.
        Default is
        :attr:`~pyanno.AnnotationsContainer.DEFAULT_MISSING_VALUES_STR`
    """
    container = AnnotationsContainer.from_file(
        filename,
        missing_values=missing_values,
    )
    return container.annotations

Table Of Contents