Source code for HErmes.selection

"""
Provides containers for in-memory variables. These containers are called "categories",
and they represent a set of variables for a certain type of data. Categories can
be further grouped into "Datasets". Variables can be read out from files and stored
in memory in the form of numpy arrays or pandas Series/DataFrames. Selection criteria
can be applied simultaneously (and reversibly) to all categories in a dataset with the "Cut"
class.

HErmes.selection provides the following submodules:

- `categories` : Container classes for variables.

- `dataset` : Grouping categories together.

- `cut` : Apply selection criteria on variables in a category.

- `variables` : Variable definition. Harvest variables from files.

- `magic_keywords` : A bunch of fixed names for automatic weight calculation.
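
The typical entry point is the `load_dataset` function defined below, which reads a
json/hjson configuration (or an equivalent dict) and returns a fully populated dataset.
A minimal configuration sketch follows; all paths, prefixes, dataset numbers, model
and module names in it are hypothetical placeholders, only the keys mirror what
`load_dataset` actually reads::

    {
        "files_basepath"       : "/path/to/data",
        "variable_definitions" : "my_variable_defs",
        "categories" : {
            "exp" : {
                "datatype"    : "data",
                "subpath"     : "exp",
                "file_prefix" : "Run",
                "file_type"   : ".h5"
            },
            "numu" : {
                "datatype"    : "simulation",
                "subpath"     : "numu",
                "file_prefix" : "Sim",
                "file_type"   : ".h5",
                "datasets"    : {"11069" : 1000},
                "model"       : "SomeFluxClass.some_flux",
                "model_args"  : [],
                "weights"     : "mc_weight"
            }
        }
    }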


"""
from __future__ import absolute_import

import hjson
import os
import os.path
import inspect
import importlib
import re

from ..utils.logger import Logger

from . import categories as c
from . import dataset as ds
from ..icecube_goodies import weighting as wgt
from ..analysis import fluxes as fluxes

def load_dataset(config, variables=None, max_cpu_cores=c.MAX_CORES):
    """
    Read a json configuration file and load a dataset populated with variables
    from the files given in the configuration file.

    Args:
        config (str/dict): json style config file or dict

    Keyword Args:
        variables (list): list of strings of variable names to read out
        max_cpu_cores (int): maximum number of cpu cores to use for variable readout

    Returns:
        HErmes.selection.dataset.Dataset
    """
    cfg = config
    if not isinstance(config, dict):
        assert os.path.exists(config), "Config file {} does not exist!".format(config)
        with open(config) as cfgfile:
            cfg = hjson.load(cfgfile)

    categories = dict()
    models = dict()
    model_args = dict()
    files_basepath = cfg["files_basepath"]

    for cat in list(cfg["categories"].keys()):
        thiscat = cfg["categories"][cat]

        # by default accept every file; an optional "file_regex" in the
        # category section restricts the search to matching file names
        sanitizer = lambda x: True
        if "file_regex" in thiscat:
            pattern = re.compile(thiscat["file_regex"])
            Logger.debug("Will look for files with pattern {}".format(pattern))

            def sanitizer(x):
                return pattern.search(x) is not None

        if thiscat["datatype"] == "simulation":
            categories[cat] = c.Simulation(cat)
            # remember that json keys are strings, so convert to int
            datasets = {}
            if "datasets" in thiscat:
                datasets = {int(x): int(thiscat["datasets"][x]) for x in thiscat["datasets"]}
            categories[cat].get_files(os.path.join(files_basepath, thiscat["subpath"]),
                                      prefix=thiscat["file_prefix"],
                                      datasets=datasets,
                                      sanitizer=sanitizer,
                                      ending=thiscat["file_type"])
            if "model" not in thiscat:
                models[cat] = None
                model_args[cat] = [None]
            else:
                try:
                    # a model is given as "FluxClass.method" and looked up
                    # in HErmes.analysis.fluxes
                    fluxclass, flux = thiscat["model"].split(".")
                    models[cat] = getattr(dict(inspect.getmembers(fluxes))[fluxclass], flux)
                    model_args[cat] = thiscat["model_args"]
                except ValueError:
                    Logger.warning("{} does not seem to be a valid model for {}. This might cause trouble. If not, it is probably fine!".format(thiscat["model"], cat))
                    models[cat] = None

        elif thiscat["datatype"] == "data":
            categories[cat] = c.Data(cat)
            categories[cat].get_files(os.path.join(files_basepath, thiscat["subpath"]),
                                      prefix=thiscat["file_prefix"],
                                      sanitizer=sanitizer,
                                      ending=thiscat["file_type"])

        elif thiscat["datatype"] == "reweighted":
            # reweighted categories need their parent category first,
            # so they are dealt with in a second pass below
            pass

        else:
            raise TypeError("Data type not understood. Has to be either 'simulation', 'reweighted' or 'data'!!")

    # at last we can take care of reweighted categories
    for cat in list(cfg["categories"].keys()):
        thiscat = cfg["categories"][cat]
        if thiscat["datatype"] == "reweighted":
            categories[cat] = c.ReweightedSimulation(cat, categories[thiscat["parent"]])
        elif thiscat["datatype"] in ["data", "simulation"]:
            pass
        else:
            raise TypeError("Data type not understood. Has to be either 'simulation', 'reweighted' or 'data'!!")

    # attach the name of the weight variable to each simulation category
    for cat in categories:
        if isinstance(categories[cat], c.Data):
            continue
        if "weights" not in cfg["categories"][cat]:
            continue
        categories[cat].weightvarname = cfg["categories"][cat]["weights"]

    # combined categories - an optional config section mapping a combined
    # category name to a list of the category names it groups together
    combined_categories = dict()
    for k in cfg.get("combined_categories", {}):
        combined_categories[k] = [categories[l] for l in cfg["combined_categories"][k]]

    # import variable definitions - either a single importable module name
    # or a dict of category name -> module name
    vardefs = cfg["variable_definitions"]
    if isinstance(vardefs, str):
        vardefs = importlib.import_module(cfg["variable_definitions"])
    elif isinstance(vardefs, dict):
        vardefs = {}
        for k in cfg["variable_definitions"]:
            vardefs[k] = importlib.import_module(cfg["variable_definitions"][k])
    else:
        raise ValueError("Cannot understand variable definitions {} of type {}".\
                         format(vardefs, type(vardefs)))

    dataset = ds.Dataset(*list(categories.values()),
                         combined_categories=combined_categories)
    dataset.load_vardefs(vardefs)
    dataset.read_variables(names=variables, max_cpu_cores=max_cpu_cores)
    dataset.calculate_weights(model=models, model_args=model_args)

    # per-category plot styles, if given in the configuration
    plot_dict = {}
    for k in cfg["categories"]:
        if "plotting" in cfg["categories"][k]:
            plot_dict[k] = cfg["categories"][k]["plotting"]
    dataset.set_default_plotstyles(plot_dict)

    return dataset
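

# A minimal usage sketch, assuming a configuration file like the one outlined in the
# module docstring exists; the file name and variable names below are hypothetical
# placeholders, not part of the package:
#
#     import HErmes.selection as sel
#
#     dataset = sel.load_dataset("analysis_config.json",
#                                variables=["energy", "zenith"])
#     print(dataset)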