# Source code for analysis.Analysis

from __future__ import print_function
import os
import numpy as np
import logging
import pdb

from fileops import pathops

logger = logging.getLogger(__name__)

class Analysis(object):

    """
    Basic descriptor class to build analyses on.

    The Analysis base class works as an interface between child descriptor
    objects and the HDF5 storage file. This is designed to separate descriptor
    generation from data IO, allowing for quick development of new descriptor
    classes.  The base Analysis class has methods for retrieving analyses from
    file and saving data created by analysis objects to file. It also has basic
    formatting methods used to return data in the required format for
    processes such as descriptor comparisons.

    In order to create a new descriptor, the hdf5_dataset_formatter method will
    need to be overwritten by the child class to generate and store the
    descriptor's output in the appropriate manner. Examples of this can be seen
    through the currently implemented descriptors.
    """

    def __init__(self, AnalysedAudioFile, frames, analysis_group, name, config=None):
        """
        Store the objects needed to create and read back an analysis.

        AnalysedAudioFile: Object to be analysed. Its ``force_analysis``
        attribute is read by create_analysis.
        frames: Accepted for interface compatibility; not used by the base
        class.
        analysis_group: Group-like storage object (``create_group`` and item
        access are used) that holds one sub-group per analysis.
        name: Name of the analysis. Used as the sub-group key and as part of
        the logger name.
        config: Accepted for interface compatibility; not used by the base
        class.
        """
        # Create object logger
        self.logger = logging.getLogger(__name__ + '.{0}Analysis'.format(name))
        # Store AnalysedAudioFile object to be analysed.
        self.AnalysedAudioFile = AnalysedAudioFile
        self.analysis_group = analysis_group
        self.name = name

    def create_analysis(self, *args, **kwargs):
        """
        Create the analysis and save to the HDF5 file.

        All arguments are forwarded unchanged to hdf5_dataset_formatter. With
        the generic formatter the first positional argument is the function
        used to create the analysis; its returned data will be stored in the
        HDF5 file.

        If the analysis group already contains data it is reused, unless the
        audio file's ``force_analysis`` flag is set, in which case the old
        data is deleted and the analysis is generated again.
        """
        try:
            self.analysis = self.analysis_group.create_group(self.name)
        except ValueError:
            self.logger.info("{0} analysis group already exists".format(self.name))
            self.analysis = self.analysis_group[self.name]

        if self.AnalysedAudioFile.force_analysis:
            self.logger.info("Force re-analysis is enabled. "
                             "deleting: {0}".format(self.analysis.name))
            # Delete all pre-existing data in database. The keys are copied
            # to a list first so the group is not modified while it is being
            # iterated over.
            for key in list(self.analysis.keys()):
                del self.analysis[key]
        elif len(self.analysis) > 0:
            self.logger.info("Analysis already exists. Reading from: "
                             "{0}".format(self.analysis.name))
            return

        self._store_analysis(*args, **kwargs)

    def _store_analysis(self, *args, **kwargs):
        """Run the formatter and write its datasets and attributes to file."""
        # Run the analysis function and format its returned data ready to
        # be saved in the HDF5 file
        data_dict, attrs_dict = self.hdf5_dataset_formatter(*args, **kwargs)
        # items() is used rather than iteritems() so that this works on both
        # Python 2 and Python 3.
        for key, value in data_dict.items():
            self.analysis.create_dataset(key, data=value, chunks=True)
        for key, value in attrs_dict.items():
            self.analysis.attrs[key] = value

    def get_analysis_grains(self, start, end):
        """
        Retrieve analysis frames for period specified in start and end times.

        start, end: Grain boundaries, either scalars or 1-D arrays of equal
        length. They are divided by 1000 before being compared with the
        stored "times" dataset.

        Returns a tuple ``(frames, selection)``. ``frames`` is the complete
        "frames" dataset and ``selection`` is a boolean array with one row
        per grain and one column per stored time, marking the frames that
        fall inside each grain (boundaries included). A grain that contains
        no frames has the two frames closest to its centre selected instead.
        """
        analysis = self.analysis_group[self.name]
        times = analysis["times"][:]
        # Convert to float before dividing so that integer input is not
        # truncated by integer division on Python 2.
        start = np.atleast_1d(np.asarray(start, dtype=float)) / 1000.0
        end = np.atleast_1d(np.asarray(end, dtype=float)) / 1000.0
        vtimes = times.reshape(-1, 1)

        # Broadcasting gives (n_times, n_grains); transpose for one row per
        # grain.
        selection = np.transpose((vtimes >= start) & (vtimes <= end))

        # If there are no frames for a grain, take the two frames closest to
        # the centre of that grain. This is checked per grain so that one
        # populated grain does not hide the empty ones.
        empty_grains = np.flatnonzero(~selection.any(axis=1))
        if empty_grains.size:
            start, end = np.broadcast_arrays(start, end)
            centres = start + (end - start) / 2.0
            # Distance from every frame time to each empty grain's centre,
            # shape (n_empty_grains, n_times).
            distances = np.abs(
                times.reshape(1, -1) - centres[empty_grains].reshape(-1, 1)
            )
            nearest = np.argsort(distances, axis=1)[:, :2]
            selection[empty_grains.reshape(-1, 1), nearest] = True

        frames = analysis["frames"][:]

        grain_data = (frames, selection)

        return grain_data

    @staticmethod
    def hdf5_dataset_formatter(analysis_method, *args, **kwargs):
        '''
        Note: This is a generic formatter designed as a template to be
        overwritten by a descriptor sub-class.

        Formats the output from the analysis method to save to the HDF5 file.

        analysis_method: Callable invoked with the remaining arguments. It
        must return an ``(output, attributes)`` pair.

        Places data and attributes in 2 dictionaries to be stored in the HDF5
        file: ``{'data': output}`` and ``{'attrs': attributes}``.

        This is a static method because the signature has no ``self``
        parameter; as a plain method, calling it through an instance passed
        the instance itself as ``analysis_method``.
        '''
        output, attributes = analysis_method(*args, **kwargs)
        return ({'data': output}, {'attrs': attributes})

    ################################################################################
    # Formatting functions
    ################################################################################

    def log2_median(self, x):
        """Return the median of ``1000 * log2(1 + x / 1000)``."""
        # A float divisor avoids integer division on Python 2.
        return np.median(1000 * np.log2(1 + x / 1000.0))

    def log2_mean(self, x):
        """Return the mean of ``1000 * log2(1 + x / 1000)``."""
        # A float divisor avoids integer division on Python 2.
        return np.mean(1000 * np.log2(1 + x / 1000.0))

    def formatter_func(self, selection, frames, valid_inds, formatter=None):
        """
        Reduce the valid frames of a single grain to one value.

        selection: 1-D boolean mask of the frames belonging to the grain.
        frames: 1-D array of analysis values.
        valid_inds: 1-D boolean mask of the frames that may be used.
        formatter: Callable applied to the selected valid frames.

        Returns the formatter's result, or NaN if the grain has no valid
        frames.
        """
        # get all valid frames from current grain
        frames = frames[selection & valid_inds]

        # Nothing to reduce. Return NaN directly instead of letting numpy
        # emit an empty-slice warning and produce NaN anyway.
        if not frames.size:
            return np.nan

        # NOTE(review): a "less than half the frames are valid" check used to
        # follow the return statement and so never ran. As written it compared
        # the valid frame count with half of itself, so it could not have
        # triggered either; it has been removed rather than enabled.
        return formatter(frames)

    def analysis_formatter(self, frames, selection, format):
        """
        Calculate the average analysis value of the grain using the match
        format specified.

        frames: 1-D array of analysis values.
        selection: 2-D boolean array with one row per grain, as returned by
        get_analysis_grains.
        format: One of 'mean', 'median', 'log2_mean' or 'log2_median'.

        Returns an array with one value per grain, or NaN if selection is
        empty. Raises KeyError for an unknown format.
        """
        if not selection.size:
            # TODO: Add warning here
            return np.nan

        format_style_dict = {
            'mean': np.mean,
            'median': np.median,
            'log2_mean': self.log2_mean,
            'log2_median': self.log2_median,
        }
        # Non-finite values (NaN/inf) are excluded from every grain.
        valid_inds = np.isfinite(frames)

        return np.apply_along_axis(
            self.formatter_func,
            1,
            selection,
            frames,
            valid_inds,
            formatter=format_style_dict[format]
        )