"""Module for handling linear combinations of ensembles"""
import fnmatch
import logging
import pandas as pd
logger = logging.getLogger(__name__)
class EnsembleCombination(object):
"""The class is used to perform linear operations on ensembles.
When instantiated, the linear combination will not actually be
computed before the results are actually asked for - lazy
evaluation.
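Example:
    A minimal usage sketch; the ensemble names and filesystem paths
    below are illustrative only:

        from fmu.ensemble import ScratchEnsemble

        ens_a = ScratchEnsemble("iter-0", "/scratch/case/realization-*/iter-0")
        ens_b = ScratchEnsemble("iter-1", "/scratch/case/realization-*/iter-1")
        delta = ens_b - ens_a  # lazy, nothing is computed yet
        half_delta = 0.5 * delta  # still lazy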
"""
def __init__(self, ref, scale=None, add=None, sub=None):
"""Set up an object for a linear combination of ensembles.
Each instance of this object can only hold one operation,
either addition/substraction of two
ensembles/ensemblecombinations or a scaling of one.
ScratchEnsembles and VirtualEnsembles can be combined freely.
A long expression of ensembles will lead to an evaluation tree
consisting of instances of this class with actual ensembles
at the leaf nodes.
Args:
    ref: ensemble or ensemblecombination that the scaling, addition
        or subtraction is applied to
    scale: float for scaling the ensemble or ensemblecombination
    add: ensemble or ensemblecombination with a positive sign
    sub: ensemble or ensemblecombination with a negative sign.
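Example:
    Direct construction is normally not needed; the arithmetic
    operators build these objects for you. A minimal sketch,
    assuming ens_a and ens_b are ensemble objects:

        diff = EnsembleCombination(ens_b, sub=ens_a)  # ens_b - ens_a
        scaled = EnsembleCombination(ens_a, scale=0.5)  # 0.5 * ens_a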
"""
self.ref = ref
if scale is not None:
    self.scale = scale
else:
    self.scale = 1
if add:
self.add = add
else:
self.add = None
# Alternatively, subtraction could be implemented as a combination
# of __mul__ and __add__
if sub:
self.sub = sub
else:
self.sub = None
def keys(self):
"""Return the intersection of all keys available in reference
ensemble(combination) and the other
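Example:
    Assuming delta is an EnsembleCombination, e.g. built as
    ens_b - ens_a from two ensembles:

        common_keys = delta.keys()  # set of keys present in all operands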
"""
combkeys = set(self.ref.keys())
if self.add:
combkeys = combkeys.intersection(self.add.keys())
if self.sub:
combkeys = combkeys.intersection(self.sub.keys())
return combkeys
def get_df(self, localpath, merge=None):
"""Obtain given data from the ensemblecombination,
doing the actual computation of ensemble on the fly.
Warning: In order to add dataframes together with meaning,
using pandas.add, the index of the frames must be correctly set,
and this can be tricky for some datatypes (f.ex. volumetrics table
where you want to add together volumes for correct zone
and fault segment).
If you have the columns "REAL", "DATE", "ZONE" and/or "REGION", it
will be regarded as an index column.
Args:
localpath (str): refers to the internalized name of the
data wanted in each ensemble.
merge (list or str): Optional data to be merged in for the data
The merge will happen as deep as possible (in realization
objects in case of ScratchEnsembles), and all ensemble
combination computations happen after merging. Be careful
with index guessing and merged data.
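Example:
    Assuming delta is an EnsembleCombination (e.g. ens_b - ens_a)
    and that parameters.txt is internalized in all involved
    ensembles:

        paramdiff = delta.get_df("parameters.txt")
        # Only numerical parameter columns survive the subtraction,
        # and REAL is restored as a column after computation.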
"""
# pandas.add() only gives meaningful results when the index is
# set correctly, so we must guess which columns form the index.
indexlist = []
indexcandidates = ["REAL", "DATE", "ZONE", "REGION"]
for index in indexcandidates:
if index in self.ref.get_df(localpath).columns:
indexlist.append(index)
logger.debug("get_df() inferred index columns to %s", str(indexlist))
refdf = self.ref.get_df(localpath, merge=merge).set_index(indexlist)
refdf = refdf.select_dtypes(include="number")
result = refdf.mul(self.scale)
if self.add:
otherdf = self.add.get_df(localpath, merge=merge).set_index(indexlist)
otherdf = otherdf.select_dtypes(include="number")
result = result.add(otherdf)
if self.sub:
otherdf = self.sub.get_df(localpath, merge=merge).set_index(indexlist)
otherdf = otherdf.select_dtypes(include="number")
result = result.sub(otherdf)
# Delete rows where everything is NaN, which will be the case when
# the realization (multi-)indices do not match up in both ensembles.
result.dropna(axis="index", how="all", inplace=True)
# Also delete columns where everything is NaN, which happens when
# the operands do not have identical columns.
result.dropna(axis="columns", how="all", inplace=True)
return result.reset_index()
def to_virtual(self, keyfilter=None):
"""Evaluate the current linear combination and return as
a virtual ensemble.
Args:
keyfilter (list or str): If supplied, only keys matching one of
    the wildcard patterns in this argument will be included. Use
    this for speed reasons when only some data is needed. Default
    is to include everything. Each pattern is wrapped in '*' on
    both sides, so supplying "unsmry" will match every key
    containing that string.
Returns:
VirtualEnsemble
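Example:
    Assuming delta is an EnsembleCombination whose operands have
    internalized summary data:

        vens = delta.to_virtual(keyfilter="unsmry")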
"""
# pylint: disable=import-outside-toplevel
from .virtualensemble import VirtualEnsemble
if keyfilter is None:
keyfilter = "*"
if isinstance(keyfilter, str):
keyfilter = [keyfilter]
if not isinstance(keyfilter, list):
raise TypeError("keyfilter in to_virtual() must be list or string")
vens = VirtualEnsemble(name=str(self))
for key in self.keys():
if any(
    fnmatch.fnmatch(key, "*" + pattern + "*") for pattern in keyfilter
):
logger.info("Calculating ensemblecombination on %s", key)
vens.append(key, self.get_df(key))
vens.update_realindices()
return vens
def get_smry_dates(
self, freq="monthly", normalize=True, start_date=None, end_date=None
):
"""Create a union of dates available in the
involved ensembles
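Example:
    Assuming delta is an EnsembleCombination built from
    ScratchEnsembles with Eclipse summary data:

        dates = delta.get_smry_dates(freq="yearly")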
"""
dates = set(self.ref.get_smry_dates(freq, normalize, start_date, end_date))
if self.add:
dates = dates.union(
set(self.add.get_smry_dates(freq, normalize, start_date, end_date))
)
if self.sub:
dates = dates.union(
set(self.sub.get_smry_dates(freq, normalize, start_date, end_date))
)
dates = list(dates)
dates.sort()
return dates
def get_smry(self, column_keys=None, time_index=None):
"""
Loads the Eclipse summary data directly from the underlying
ensemble data. The ensembles can be ScratchEnsemble or
VirtualEnsemble; for scratch ensembles the binary summary
files are accessed directly, while for virtual ensembles the
summary data must have been loaded earlier.
Args:
column_keys (str or list): column key wildcards. Default
is '*', which will match all vectors in the Eclipse
output.
time_index (str or list of DateTime): time_index mnemonic or
    a list of explicit datetimes at which the summary data
    is requested (interpolated or extrapolated)
Returns:
    pd.DataFrame with one row per realization and date. Has at
    least the columns REAL and DATE if not empty.
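Example:
    Assuming delta is an EnsembleCombination built from
    ScratchEnsembles with Eclipse runs:

        fopt_diff = delta.get_smry(column_keys="FOPT", time_index="yearly")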
"""
if isinstance(time_index, str):
time_index = self.get_smry_dates(time_index)
indexlist = ["REAL", "DATE"]
refdf = self.ref.get_smry(
time_index=time_index, column_keys=column_keys
).set_index(indexlist)
result = refdf.mul(self.scale)
if self.add:
otherdf = self.add.get_smry(
time_index=time_index, column_keys=column_keys
).set_index(indexlist)
result = result.add(otherdf)
if self.sub:
otherdf = self.sub.get_smry(
time_index=time_index, column_keys=column_keys
).set_index(indexlist)
result = result.sub(otherdf)
return result.reset_index()
def get_smry_stats(self, column_keys=None, time_index="monthly"):
"""
Function to extract the ensemble statistics (Mean, Min, Max, P10, P90)
for a set of simulation summary vectors (column key).
Compared to the agg() function, this function only works on summary
data (time series), and will only operate on actually requested data,
independent of what is internalized. It accesses the summary files
directly and can thus obtain data at any time frequency.
Args:
column_keys: list of column key wildcards
time_index: list of DateTimes if interpolation is wanted,
    or a frequency string such as 'monthly' (the default).
    If a string is supplied, it is passed to get_smry_dates()
    in order to obtain a time index.
Returns:
    A dataframe with a two-level MultiIndex. The outer index level
    holds the statistics 'mean', 'p10', 'p90', 'maximum' and
    'minimum', the inner index level holds the dates. Column names
    are the different vectors. Quantiles follow the scientific
    standard, opposite to the oil industry standard.
TODO: add warning message when failed realizations are removed
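Example:
    Assuming delta is an EnsembleCombination built from
    ScratchEnsembles with Eclipse runs:

        stats = delta.get_smry_stats(column_keys="FOPT", time_index="monthly")
        mean_fopt = stats.loc["mean"]["FOPT"]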
"""
# Obtain an aggregated dataframe for only the needed columns over
# the entire ensemble.
dframe = (
self.get_smry(time_index=time_index, column_keys=column_keys)
.drop(columns="REAL")
.groupby("DATE")
)
mean = dframe.mean()
p90 = dframe.quantile(q=0.90)
p10 = dframe.quantile(q=0.10)
maximum = dframe.max()
minimum = dframe.min()
return pd.concat(
[mean, p10, p90, maximum, minimum],
keys=["mean", "p10", "p90", "maximum", "minimum"],
names=["statistic"],
sort=False,
)
def agg(self, aggregation, keylist=None, excludekeys=None):
"""Aggregator, this is a wrapper that will
call .to_virtual() on your behalf and call the corresponding
agg() in VirtualEnsemble.
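Example:
    A sketch, assuming delta is an EnsembleCombination and that
    "mean" is an aggregation supported by VirtualEnsemble.agg():

        mean_df = delta.agg("mean")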
"""
return self.to_virtual().agg(aggregation, keylist, excludekeys)
def get_volumetric_rates(
self, column_keys=None, time_index="monthly", time_unit=None
):
"""Compute volumetric rates from cumulative summary
vectors.
Column names that do not refer to cumulative summary
vectors are silently ignored.
A dataframe is returned with volumetric rates, that is, rate
values that can be summed up to the cumulative version. The
'T' in the column name is switched with 'R'. If you ask for
FOPT, you will get FOPR in the returned dataframe.
Rates in the returned dataframe are valid **forwards** in time,
as opposed to rates coming directly from the Eclipse simulator,
which are valid backwards in time.
If time_unit is set, the rates will be scaled to represent
either daily, monthly or yearly rates. These will sum up to the
cumulative as long as you multiply with the correct number
of days, months or years between each consecutive date index.
Month lengths and leap years are correctly handled.
Args:
column_keys: str or list of strings, cumulative summary vectors
time_index: str or list of datetimes
time_unit: str or None. If None, the rates returned will
be the difference in cumulative between each included
time step (where the time interval can vary arbitrarily)
If set to 'days', 'months' or 'years', the rates will
be scaled to represent a daily, monthly or yearly rate that
is compatible with the date index and the cumulative data.
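Example:
    Assuming delta is an EnsembleCombination whose operands have
    internalized monthly summary data including FOPT:

        fopr = delta.get_volumetric_rates(
            column_keys="FOPT", time_index="monthly"
        )
        # The returned dataframe contains FOPR, the forward-in-time
        # rate corresponding to the FOPT difference.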
"""
return self.to_virtual(keyfilter="unsmry").get_volumetric_rates(
column_keys=column_keys, time_index=time_index, time_unit=time_unit
)
@property
def parameters(self):
"""Return parameters from the ensemble as a class property"""
try:
return self.get_df("parameters.txt")
except KeyError:
return pd.DataFrame()
def __len__(self):
"""Estimate the number of realizations in this
ensemble combinations.
This is not always well defined in cases of strange
combinations of which data is available in which realization,
so after actual computation of a virtual ensemble, the number
of realizations can be less that what this estimate returns
Returns:
int, number of realizations (upper limit)
"""
return len(self.get_realindices())
def get_realindices(self):
"""Return the integer indices for realizations in this ensemble
There is no guarantee that all realizations returned here
will be valid for all datatypes after computation.
Returns:
list of integers
"""
indices = set(self.ref.get_realindices())
if self.add:
indices = indices.intersection(set(self.add.get_realindices()))
if self.sub:
indices = indices.intersection(set(self.sub.get_realindices()))
return list(indices)
def __getitem__(self, localpath):
return self.get_df(localpath)
def __repr__(self):
"""Try to give out a linear expression"""
# NB: The implementation of this method requires scaling not to
# happen simultaneously with adds or subs.
scalestring = ""
addstring = ""
substring = ""
if self.scale != 1:
scalestring = str(self.scale) + " * "
if self.add:
addstring = " + " + str(self.add)
if self.sub:
substring = " - " + str(self.sub)
return scalestring + str(self.ref) + addstring + substring
def __sub__(self, other):
"""Substract another ensemble from this combination"""
return EnsembleCombination(self, sub=other)
def __add__(self, other):
"""Add another ensemble from this combination"""
return EnsembleCombination(self, add=other)
def __radd__(self, other):
"""Add another ensemble from this combination"""
return EnsembleCombination(self, add=other)
def __rsub__(self, other):
    """Subtract this combination from another ensemble"""
    # other - self equals -(self - other), expressed via a scale of -1
    return EnsembleCombination(EnsembleCombination(self, sub=other), scale=-1)
def __mul__(self, other):
"""Scale this EnsembleCombination by a scalar value"""
return EnsembleCombination(self, scale=float(other))
def __rmul__(self, other):
"""Scale this EnsembleCombination by a scalar value"""
return EnsembleCombination(self, scale=float(other))