"""
Container classes for variables
"""
import numpy as np
import os
import pandas as pd
import tables
import abc
import enum
import array
import numbers
from ..utils import files as f
from ..utils import Logger
from copy import deepcopy as copy
DEFAULT_BINS = 70
REGISTERED_FILEEXTENSIONS = [".h5"]
try:
import uproot as ur
import uproot_methods.classes.TVector3 as TVector3
import uproot_methods.classes.TLorentzVector as TLorentzVector
import uproot_methods.classes.TH1
REGISTERED_FILEEXTENSIONS.append(".root")
except ImportError:
Logger.warning("No uproot found, root support is limited!")
################################################################
# define a non-member function so that it can be used in a
# multiprocessing approach
################################################################
def harvest(filenames, definitions, **kwargs):
"""
    Read variables from files into memory. Will be used by HErmes.selection.variables.Variable.harvest
    This will be run multi-threaded. Keep that in mind: the arguments have to be picklable,
    and everything which is read out must be picklable as well. Lambda functions are NOT picklable.
Args:
filenames (list): the files to extract the variables from.
currently supported: hdf
definitions (list): where to find the data in the files. They usually
have some tree-like structure, so this a list
of leaf-value pairs. If there is more than one
all of them will be tried. (As it might be that
in some files a different naming scheme was used)
Example: [("hello_reoncstruction", "x"), ("hello_reoncstruction", "y")] ]
Keyword Args:
transformation (func): After the data is read out from the files,
transformation will be applied, e.g. the log
to the energy.
fill_empty (bool): Fill empty fields with zeros
nevents (int): ROOT only - read out only nevents from the files
        reduce_dimension (int): ROOT only - multidimensional data can be reduced by only
                                using the index given by reduce_dimension.
                                E.g. in case of a TVector3, if we want to have only
                                x, that would be 0, y -> 1 and z -> 2.
        dtype (np.dtype) : datatype to cast to (default np.float64, but this can be used
                           to reduce the memory footprint).
Returns:
pd.Series or pd.DataFrame
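    Example:
        A minimal sketch; the file names and the table/column definition
        below are illustrative only::

            data = harvest(["run1.h5", "run2.h5"],
                           [("reconstruction", "energy")])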
"""
    nevents = kwargs.get("nevents", None)
    fill_empty = kwargs.get("fill_empty", False)
    reduce_dimension = kwargs.get("reduce_dimension", None)
    transform = kwargs.get("transformation", None)
    dtype = kwargs.get("dtype", np.float64)
concattable = True
data = pd.Series(dtype=dtype)
#multidim_data = pd.DataFrame()
for filename in filenames:
filetype = f.strip_all_endings(filename)[1]
assert filetype in REGISTERED_FILEEXTENSIONS, "Filetype {} not known!".format(filetype)
        assert os.path.exists(filename), "File {} does not exist!".format(filename)
if (filetype == ".h5") and (transform is not None):
Logger.critical("Can not apply direct transformation for h5 files (yet). This is only important for root files and varaibles which are used as VariableRole.PARAMETER")
Logger.debug("Attempting to harvest {1} file {0}".format(filename,filetype))
if filetype == ".h5" and not isinstance(filename, tables.table.Table):
# store = pd.HDFStore(filename)
hdftable = tables.open_file(filename)
else:
hdftable = filename
tmpdata = pd.Series(dtype=dtype)
for definition in definitions:
definition = list(definition)
if filetype == ".h5":
if not definition[0].startswith("/"):
definition[0] = "/" + definition[0]
try:
# data = store.select_column(*definition)
if not definition[1]:
tmpdata = hdftable.get_node(definition[0])
else:
tmpdata = hdftable.get_node(definition[0]).col(definition[1])
if tmpdata.ndim == 2:
if data.empty:
data = pd.DataFrame()
tmpdata = pd.DataFrame(tmpdata, dtype=dtype)
else:
tmpdata = pd.Series(tmpdata, dtype=dtype)
Logger.debug("Found {} entries in table for {}{}".format(len(tmpdata),definition[0],definition[1]))
break
except tables.NoSuchNodeError:
Logger.debug("Can not find definition {0} in {1}! ".format(definition, filename))
continue
elif filetype == ".root":
tmpdata, concattable = extract_from_root(filename, definitions,
nevents=nevents,
dtype=dtype,
transform=transform,
reduce_dimension=reduce_dimension)
if filetype == ".h5":
hdftable.close()
        #tmpdata = harvest_single_file(filename, filetype, definitions)
        # concat should be much faster than append
        if not any(isinstance(tmpdata, k) for k in (pd.Series, pd.DataFrame)):
            concattable = False
        if not concattable:
            Logger.warning(f"Data {definitions} can not be concatenated, keep that in mind!")
            try:
                tmpdata = pd.Series(tmpdata)
            except Exception:
                # fall back to materializing the iterable first
                tmpdata = pd.Series(list(tmpdata))
data = pd.concat([data, tmpdata])
del tmpdata
return data
################################################################
def freedman_diaconis_bins(data, leftedge,
                           rightedge, minbins=20,
                           maxbins=70, fallbackbins=DEFAULT_BINS):
"""
Get a number of bins for a histogram
following Freedman/Diaconis
    Args:
        data (np.ndarray): the data to be binned
        leftedge (float): left bin edge
        rightedge (float): right bin edge
        minbins (int): the minimum number of bins
        maxbins (int): the maximum number of bins
        fallbackbins (int): a number of bins which is returned
                            if the calculation fails
Returns:
nbins (int): number of bins, minbins < bins < maxbins
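    Example:
        A minimal sketch with synthetic data::

            import numpy as np
            data = np.random.normal(0, 1, 10000)
            nbins = freedman_diaconis_bins(data, data.min(), data.max())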
"""
    try:
        finite_data = np.isfinite(data)
        q3 = np.percentile(data[finite_data], 75)
        q1 = np.percentile(data[finite_data], 25)
        n_data = len(data)
        if q3 == q1:
            Logger.warning("Can not calculate bins, falling back to min/max approach")
            q3 = max(data[finite_data])
            q1 = min(data[finite_data])
        # Freedman-Diaconis rule: h = 2*IQR/cbrt(n)
        h = (2*(q3 - q1))/(n_data**(1./3))
        bins = (rightedge - leftedge)/h
        if not np.isfinite(bins):
            Logger.info(f"Got some nan somewhere: q1 : {q1}, q3 : {q3}, n_data : {n_data}, h : {h}")
            Logger.warning("Freedman-Diaconis bin calculation yielded nan bins, returning fallback")
            bins = fallbackbins
        # the docstring promises an int, and np.linspace requires one
        bins = int(bins)
        if bins < minbins:
            bins = minbins
        if bins > maxbins:
            bins = maxbins
except Exception as e:
Logger.warning(f"Calculate Freedman-Draconis bins failed {e}")
bins = fallbackbins
return bins
#############################################################
class VariableRole(enum.Enum):
"""
Define roles for variables. Some variables used in a special context (like weights)
are easily recognizable by this flag.
"""
UNKNOWN = 0
SCALAR = 10
ARRAY = 20
GENERATORWEIGHT = 30
RUNID = 40
EVENTID = 50
STARTIME = 60
ENDTIME = 70
FLUXWEIGHT = 80
PARAMETER = 90 # a single parameter, no array whatsoever
##############################################################
class AbstractBaseVariable(metaclass=abc.ABCMeta):
"""
Read out tagged numerical data from files
"""
_harvested = False
_bins = None
ROLES = VariableRole
def __hash__(self):
return hash(self.name)
def __repr__(self):
return """<Variable: {}>""".format(self.name)
def __eq__(self,other):
return self.name == other.name
def __lt__(self, other):
return sorted([self.name,other.name])[0] == self.name
def __gt__(self, other):
return sorted([self.name,other.name])[1] == self.name
def __le__(self, other):
return self < other or self == other
def __ge__(self, other):
return self > other or self == other
    def declare_harvested(self):
self._harvested = True
@property
def harvested(self):
return self._harvested
@property
def bins(self):
if self._bins is None:
return self.calculate_fd_bins()
else:
return self._bins
@bins.setter
def bins(self, value):
self._bins = value
    def calculate_fd_bins(self, cutmask=None):
"""
Calculate a reasonable binning
Keyword Args:
cutmask (numpy.ndarray) : a boolean mask to cut on, in case
cuts have been applied to the
category this data is part of
Returns:
numpy.ndarray: Freedman Diaconis bins
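        Example:
            A sketch, assuming this variable has been harvested already::

                bins = var.calculate_fd_bins()
                # or restrict the binning to events passing some cut
                bins = var.calculate_fd_bins(cutmask=(var.data > 0))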
"""
tmpdata = self.data
if cutmask is not None:
if len(cutmask) > 0:
tmpdata = tmpdata[cutmask]
nbins = freedman_diaconis_bins(tmpdata, min(tmpdata), max(tmpdata))
bins = np.linspace(min(tmpdata),max(tmpdata), nbins)
return bins
    def harvest(self, *files):
"""
Hook to the harvest method. Don't use in case of multiprocessing!
Args:
*files: walk through these files and readout
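        Example:
            A sketch; the file names are illustrative only::

                var.harvest("run1.h5", "run2.h5")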
"""
if self.role == VariableRole.PARAMETER:
self._data = harvest(files, self.definitions, transformation= self.transform)
self._data = self._data[0]
else:
self._data = harvest(files, self.definitions)
self.declare_harvested()
    @abc.abstractmethod
def rewire_variables(self, vardict):
return
@property
def ndim(self):
if hasattr(self._data, "multidim"):
            if self._data.multidim:
return 2
return self._data.ndim
@property
def data(self):
if isinstance(self._data, pd.DataFrame):
            # as_matrix was deprecated in favor of values
            return self._data.values
if not hasattr(self._data, "shape"):
Logger.warning("Something's wrong, this should be array data!")
Logger.warning(f"Seeeing {type(self._data)} data")
Logger.warning("Attempting to fix!")
self._data = np.asarray(self._data)
return self._data
############################################
class Variable(AbstractBaseVariable):
"""
A hook to a single variable read out from a file
"""
    def __init__(self, name, definitions=None,
                 bins=None, label="", transform=None,
role=VariableRole.SCALAR,
nevents=None,
reduce_dimension=None):
"""
Args:
            name (str) : A unique identifier
Keyword Args:
definitions (list) : table and/or column names in underlying data
bins (numpy.ndarray) : used for histograms
label (str) : used for plotting and as a label in tables
transform (func) : apply to each member of the underlying data at readout
role (HErmes.selection.variables.VariableRole) : The role the variable is playing.
In most cases the default is the best choice
nevents (int) : number of events to read in (ROOT only right now!)
            reduce_dimension (int) : in case of multidimensionality,
                                     take only the given index of the array (ROOT only right now)
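        Example:
            A sketch; the table/column names are illustrative only. Several
            definitions can be given if the naming scheme varies across files::

                energy = Variable("energy",
                                  definitions=[("reconstruction", "energy"),
                                               ("reco", "energy")],
                                  label="E [GeV]")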
"""
AbstractBaseVariable.__init__(self)
if definitions is not None:
#assert not (False in [len(x) <= 2 for x in definitions]), "Can not understand variable definitions {}!".format(definitions)
self.defsize = len(definitions[0])
#FIXME : not sure how important this is right now
#assert not (False in [len(x) == self.defsize for x in definitions]), "All definitions must have the same length!"
else:
self.defsize = 0
self.name = name
self.role = role
self.bins = bins # when histogrammed
self.label = label
self.transform = transform
self.definitions = definitions
self._data = pd.Series(dtype=np.float64)
self.nevents = nevents
self.reduce_dimension = reduce_dimension
self._role = role
#if self.defsize == 1:
# self.data = pd.DataFrame()
#if self.defsize == 2:
# self.data = pd.Series()
    def rewire_variables(self, vardict):
"""
Make sure all the variables are connected properly. This is
only needed for combined/compound variables
Returns:
None
"""
pass
##########################################################
class CompoundVariable(AbstractBaseVariable):
"""
Calculate a variable from other variables. This kind of variable will not read any file.
"""
def __init__(self, name, variables=None, label="",\
bins=None, operation=lambda x,y : x + y,
role=VariableRole.SCALAR,
dtype=np.float64):
"""
        A compound variable is a variable which is created from two or more other variables. It does not have
        a direct representation in a file, but gets calculated on the fly instead, e.g. a residual of two other variables.
        The function given as 'operation' defines which operator is applied to the variables to create the new
        compound variable.
Args:
            name (str) : A unique identifier for the new variable.
Keyword Args:
variables (list) : A list of variables used to calculate the new variable.
label (str) : A label for plotting.
bins (np.ndarray) : binning for distributions.
operation (fnc) : The operation which will be applied to variables.
role (HErmes.selection.variables.VariableRole) : The role the variable is playing.
In most cases the default is the best choice. Assigning roles
to variables allows for special magic, e.g. in the case
of weighting
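        Example:
            A sketch; ``reco_energy`` and ``true_energy`` are assumed to be
            previously harvested Variable instances::

                residual = CompoundVariable("energy_residual",
                                            variables=[reco_energy, true_energy],
                                            operation=lambda x, y: x - y)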
"""
AbstractBaseVariable.__init__(self)
self.name = name
self.role = role
self.label = label
self.bins = bins
if variables is None:
variables = []
self.variables = variables
self.operation = operation
self._data = pd.Series(dtype=np.float64) #dtype to suppress warning
self.definitions = ((self.__repr__()),)
    def rewire_variables(self, vardict):
"""
        Used to avoid the necessity of reading out variables twice:
        as the variables are copied over by the categories,
        the reference is lost. It can be rewired with this method.
"""
newvars = []
for var in self.variables:
newvars.append(vardict[var.name])
self.variables = newvars
def __repr__(self):
return """<CompoundVariable {} created from: {}>""".format(self.name,"".join([x.name for x in self.variables ]))
    def harvest(self, *filenames):
#FIXME: filenames is not used, just
#there for compatibility
if self.harvested:
return
harvested = [var for var in self.variables if var.harvested]
        if len(harvested) != len(self.variables):
Logger.error("Variables have to be harvested for compound variable {0} first!".format(self.variables))
Logger.error("Only {} is harvested".format(harvested))
return
self._data = self.operation(*[var.data for var in self.variables])
self.declare_harvested()
##########################################################
class VariableList(AbstractBaseVariable):
"""
    A list of variables. Cannot be read out from files.
"""
def __init__(self, name, variables=None, label="", bins=None, role=VariableRole.SCALAR):
"""
Args:
            name (str): A unique identifier for the new variable list.
Keyword Args:
variables (list): A list of variables used to calculate the new variable.
label (str): A label for plotting.
bins (np.ndarray): binning for distributions.
role (HErmes.selection.variables.VariableRole): The role the variable is playing. In most cases the default is the best choice
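        Example:
            A sketch; ``x``, ``y`` and ``z`` are assumed to be previously
            defined Variable instances::

                position = VariableList("position", variables=[x, y, z])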
"""
AbstractBaseVariable.__init__(self)
self.name = name
self.label = label
self.bins = bins
if variables is None:
variables = []
self.variables = variables
    def harvest(self, *filenames):
#FIXME: filenames is not used, just
#there for compatibility
# do not calculate weights yet!
if self.harvested:
return
harvested = [var for var in self.variables if var.harvested]
        if len(harvested) != len(self.variables):
Logger.error("Variables have to be harvested for compound variable {} first!".format(self.name))
return
self.declare_harvested()
    def rewire_variables(self, vardict):
"""
        Used to avoid the necessity of reading out variables twice:
        as the variables are copied over by the categories,
        the reference is lost. It can be rewired with this method.
"""
newvars = []
for var in self.variables:
newvars.append(vardict[var.name])
self.variables = newvars
@property
def data(self):
return [x.data for x in self.variables]