Source code for pyanno.annotations

# Copyright (c) 2011, Enthought, Ltd.
# Author: Pietro Berkes <pberkes@enthought.com>
# License: Modified BSD license (2-clause)

"""Defines objects to create and manipulate raw annotations."""

from traits.has_traits import HasStrictTraits, cached_property
from traits.trait_numeric import Array
from traits.trait_types import Str, List, Int
from traits.traits import Property

import numpy as np
from pyanno.util import MISSING_VALUE, PyannoValueError


def _robust_isnan(x):
    """Return whether `x` is NaN, treating unsupported types as not NaN.

    Arguments
    ---------
    x : object
        Value to test; may be numerical or not (e.g., a string or None).

    Returns
    -------
    res : bool or numpy boolean
        The result of `np.isnan(x)` when it is defined for `x`,
        False otherwise.
    """

    # workaround for the fact that np.isnan is not defined for non-numerical
    # types, e.g. strings: NumPy signals this with a TypeError; the
    # NotImplementedError originally handled here is still caught
    try:
        res = np.isnan(x)
    except (TypeError, NotImplementedError):
        return False

    # guard against the NotImplemented singleton, which is truthy and would
    # otherwise be mistaken for "is NaN"
    if res is NotImplemented:
        return False

    return res


def _is_nan_in_list(lst):
    """Return a true value if at least one element of `lst` is NaN.

    Each element is tested with `_robust_isnan`, so non-numerical elements
    are handled by that helper rather than by `np.isnan` directly.
    """
    nan_flags = []
    for item in lst:
        nan_flags.append(_robust_isnan(item))
    return np.any(nan_flags)


class AnnotationsContainer(HasStrictTraits):
    """Translate from general annotations files and arrays to pyAnno's format.

    This class exposes a few methods to import data from files and arrays, and
    converts them to pyAnno's format:

    * annotations are 2D integer arrays; rows index items, and columns
      annotators

    * label classes are numbered 0 to :attr:`nclasses`-1 . The attribute
      :attr:`labels` defines a mapping from label tokens to label classes

    * missing values are defined as :attr:`pyanno.util.MISSING_VALUE`. The
      attribute :attr:`missing_values` contains the missing values tokens
      found in the original, raw data

    The converted data can be accessed through the :attr:`annotations` property.

    The `AnnotationsContainer` is also used as the format to store annotations
    in :class:`~pyanno.database.PyannoDatabase` objects.
    """

    DEFAULT_MISSING_VALUES_STR = ['-1', 'NA', 'None', '*']
    DEFAULT_MISSING_VALUES_NUM = [-1, np.nan, None]
    DEFAULT_MISSING_VALUES_ALL = (DEFAULT_MISSING_VALUES_STR +
                                  DEFAULT_MISSING_VALUES_NUM)

    #: raw annotations, as they are imported from file or array
    raw_annotations = List(List)

    #: name of file or array from which the annotations were imported
    name = Str

    #: list of all labels found in file/array
    labels = List

    #: labels corresponding to a missing value
    missing_values = List

    #: number of classes found in the annotations
    nclasses = Property(Int, depends_on='labels')
    def _get_nclasses(self):
        return len(self.labels)

    #: number of annotators
    nannotators = Property(Int, depends_on='raw_annotations')
    def _get_nannotators(self):
        # an empty container has no first row to measure
        if len(self.raw_annotations) == 0:
            return 0
        return len(self.raw_annotations[0])

    #: number of annotations
    nitems = Property(Int, depends_on='raw_annotations')
    def _get_nitems(self):
        return len(self.raw_annotations)

    #: annotations in pyAnno format
    # the translation reads `labels` and `missing_values` as well as the raw
    # data, so the cached value must be invalidated when any of them changes
    annotations = Property(
        Array,
        depends_on=['raw_annotations', 'labels', 'missing_values']
    )

    @cached_property
    def _get_annotations(self):
        """Translate the raw annotations into a 2D integer array.

        Labels are mapped to their index in :attr:`labels`, and missing
        values tokens to :attr:`pyanno.util.MISSING_VALUE`.
        """
        nitems, nannotators = len(self.raw_annotations), self.nannotators
        anno = np.empty((nitems, nannotators), dtype=int)

        # build map from labels and missing values to annotation values
        raw2val = dict(zip(self.labels, range(self.nclasses)))
        raw2val.update([(mv, MISSING_VALUE) for mv in self.missing_values])

        # translate
        nan_in_missing_values = _is_nan_in_list(self.missing_values)
        for i, row in enumerate(self.raw_annotations):
            for j, lbl in enumerate(row):
                if nan_in_missing_values and _robust_isnan(lbl):
                    # workaround for the fact that np.nan cannot be used as
                    # the key to a dictionary, since np.nan != np.nan
                    anno[i,j] = MISSING_VALUE
                else:
                    anno[i,j] = raw2val[lbl]

        return anno


    @staticmethod
    def _sorted_tokens(tokens):
        """Return `tokens` as a sorted list, even if they are not comparable.

        Tokens of mixed types (e.g., -1 and None) cannot be compared on
        Python 3; in that case they are ordered by type name and `repr`,
        which is arbitrary but deterministic.
        """
        try:
            return sorted(tokens)
        except TypeError:
            return sorted(tokens, key=lambda t: (type(t).__name__, repr(t)))

    @staticmethod
    def _split_nan(tokens):
        """Return the non-NaN `tokens` and a flag telling if a NaN was found.
        """
        regular = []
        has_nan = False
        for token in tokens:
            if _robust_isnan(token):
                has_nan = True
            else:
                regular.append(token)
        return regular, has_nan

    @staticmethod
    def _from_generator(rows_generator, missing_values, name=''):
        """Create an annotations object from an iterable of rows.

        Arguments
        ---------
        rows_generator : iterable of lists
            Each element is one row of raw annotations (one entry per
            annotator).

        missing_values : list
            List of labels that are considered missing values.

        name : string
            Name of the annotations.

        Raises
        ------
        PyannoValueError
            If the rows do not all have the same number of entries.
        """

        missing_set = set(missing_values)
        labels_set = set()

        raw_annotations = []
        nannotators = None
        for n, row in enumerate(rows_generator):

            # verify that number of lines is consistent in the whole file
            if nannotators is None:
                nannotators = len(row)
            elif len(row) != nannotators:
                raise PyannoValueError(
                    'File has inconsistent number of entries '
                    'on separate lines (line {})'.format(n))

            raw_annotations.append(row)
            labels_set.update(row)

        # np.nan != np.nan, so set operations cannot be trusted to match NaN
        # entries, and NaN has no well-defined position in a sorted list;
        # NaNs are therefore filtered out explicitly. There may be several
        # distinct NaN objects in the data (e.g., when rows come from a
        # floating point array), and all of them are removed
        found_labels, nan_in_labels = AnnotationsContainer._split_nan(
            labels_set - missing_set)
        found_missing, nan_in_missing = AnnotationsContainer._split_nan(
            missing_set & labels_set)

        # remove missing values from set of labels
        all_labels = AnnotationsContainer._sorted_tokens(found_labels)
        missing_values = AnnotationsContainer._sorted_tokens(found_missing)

        # a NaN found in the data is always treated as a missing value
        if nan_in_labels or nan_in_missing:
            missing_values.insert(0, np.nan)

        # create annotations object
        anno = AnnotationsContainer(
            raw_annotations = raw_annotations,
            labels = all_labels,
            missing_values = missing_values,
            name = name
        )

        return anno

    @staticmethod
    def _from_file_object(fobj, missing_values=None, name=''):
        """Useful for testing, as it can be called using a StringIO object.
        """

        if missing_values is None:
            missing_values = AnnotationsContainer.DEFAULT_MISSING_VALUES_STR

        # generator for rows of file-like object
        def file_row_generator():
            # iterate lazily instead of reading all the lines in memory
            for line in fobj:
                # remove commas and split in individual tokens
                line = line.strip().replace(',', ' ')

                # ignore empty lines
                if len(line) == 0: continue

                labels = line.split()
                yield labels

        return AnnotationsContainer._from_generator(file_row_generator(),
                                           missing_values,
                                           name=name)


    @staticmethod
    def from_file(filename, missing_values=None):
        """Load annotations from a file.

        The file is a text file with columns separated by spaces and/or
        commas, and rows on different lines.

        Arguments
        ---------
        filename : string
            File name

        missing_values : list
            List of labels that are considered missing values.
            Default is :attr:`DEFAULT_MISSING_VALUES_STR`
        """

        if missing_values is None:
            missing_values = AnnotationsContainer.DEFAULT_MISSING_VALUES_STR

        with open(filename) as fh:
            anno = AnnotationsContainer._from_file_object(fh,
                                                 missing_values=missing_values,
                                                 name=filename)

        return anno


    @staticmethod
    def from_array(x, missing_values=None, name=''):
        """Create an annotations object from an array or list-of-lists.

        Arguments
        ---------
        x : ndarray or list-of-lists
            Array or list-of-lists containing numerical or string annotations

        missing_values : list
            List of values that are considered missing values.
            Default is :attr:`DEFAULT_MISSING_VALUES_ALL`

        name : string
            Name of the annotations (for user interaction and used as key in
            databases).
        """

        if missing_values is None:
            missing_values = AnnotationsContainer.DEFAULT_MISSING_VALUES_ALL

        # generator for array objects
        def array_rows_generator():
            for row in x:
                yield list(row)

        return AnnotationsContainer._from_generator(array_rows_generator(),
                                           missing_values, name=name)


    def save_to(self, filename, set_name=False):
        """Save raw annotations to file.

        Each row of raw annotations is written on its own line, with the
        entries separated by a single space.

        Arguments
        ---------
        filename : string
            File name

        set_name : bool
            Set the :attr:`name` of the annotation container to the file name
        """
        if set_name:
            self.name = filename
        with open(filename, 'w') as f:
            f.writelines(
                (' '.join(map(str, row))+'\n'
                 for row in self.raw_annotations)
            )


def load_annotations(filename, missing_values=None):
    """Load annotations from file.

    The file is a text file with columns separated by spaces and/or
    commas, and rows on different lines.

    Arguments
    ---------
    filename : string
        File name

    missing_values : list
       List of labels that are considered missing values.
       Default is
       :attr:`~pyanno.AnnotationsContainer.DEFAULT_MISSING_VALUES_STR`

    Returns
    -------
    annotations : ndarray
        Annotations in pyAnno format, i.e. the
        :attr:`~pyanno.AnnotationsContainer.annotations` property of the
        container created from the file.
    """
    anno = AnnotationsContainer.from_file(filename,
                                          missing_values=missing_values)
    return anno.annotations
Navigation

Source code for pyanno.annotations

Table Of Contents

Quick search

Navigation