"""
Active Learning Scan to find a decision boundary using Gaussian Processes
Entropy gain taken from https://github.com/diana-hep/excursion
by Lukas Heinrich,Gilles Louppe, Kyle Cranmer
(unfortunately that code does not work any more)
Adapted by Mark Goodsell to remove integration over fixed grids and replace by integration using \f[p(c | z)\f]
Plus smaller choice of points to sample (both save a lot of time)
Plus some improvement of robustness
Settings
--------
.. code-block:: text
Setup: {
"Type": "ContourGP",
"Points": default 60, int -> number of points to check
"Initial Points": default 5, int -> number of initial random points to use for initialisation
"Noise" : default None, float -> signal noise to include in the Gaussian Process
"Grid Size": default 100, int -> Number of points to include in the integration grids
"Sample Size": default 50, int -> Number of Samples to check for the maximum entropy gain
"Sample Multiplier": default 5, int -> We select the sample from this number * the sample size of random points, where the
ones to be checked are those closest in value to the decision boundary
"Function": if present, this specifies a function of variables and/or observables to solve for f(x) = threshold (see below)
if not present, we construct a likelihood function from the observables in the usual way.
One very useful option is to set "SCALING": "USER" in the observable if it is e.g. signal strength or signal strength -1.
"Threshold": default 0.0, float -> value to subtract from the above function so that the contour we are looking for solves
f(x) = 0
"Fail Value": default 10.0, float -> value to assign to failed points
"Run Name": string -> name of the run
}
"""
__meta__ = {
"name": "ContourGP",
"requires": ["scikit-learn", "numpy", "pandas"],
"settings": {
"Points": "Int",
"Initial Points": "Int",
"Noise": "Float",
"Grid Size": "Int",
"Sample Size": "Int",
"Sample Multiplier": "Int",
"Input_CSV_File": "Path",
"Fail Value": "Float",
"Threshold": "Float",
"Function": "String"
}
}
import os,sys
from bsmart.core import Scan as Scan
import itertools
import numpy as np
import math
from bsmart import debug
from bsmart import BSMutil
from bsmart.BSMlikelihood import MakeLikelihoods
import sklearn
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, WhiteKernel
#import torch
import numpy as np
from bsmart.HEPRun import UpdateProgress, DataPoint
from scipy.linalg import cho_solve
from scipy.stats import norm
## Get sklearn warnings to shut up
[docs]
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
[docs]
def getnewcandidates(GP,ndim=2,npoints=100,multiplier=6):
"""
get a bunch of test candidates selected from those near to the boundary
"""
allpoints=[]
allyfuncs=[]
for _ in range(npoints*multiplier):
point=[np.random.uniform(0.0,1.0) for _ in range(ndim)]
mu =GP.predict([point],return_std = False)
allpoints.append(point)
#allyfuncs.append(mu[0]*(1.0-mu[0]))
allyfuncs.append(abs(mu[0]))
allpoints=np.array(allpoints)
allyfuncs=np.array(allyfuncs)
return allpoints[np.argsort(allyfuncs)[:npoints]]
#return allpoints[np.argsort(allyfuncs)[-npoints:]]
# wrap -inf in float for the documentation
[docs]
def getgrids(GP,ndim=2,npoints=50,thresholds=[float('-inf'),0.0,float('inf')]):
"""
get grids via shooting algorithm according to p(c | z)
"""
def getpcj(point,j):
"""
probability of being in each category for a given point.
Assuming GP evaluation is expensive, we could test each category for each point.
However, it is probably better to have separate points for each category, so I do that.
"""
#cjs=[]
mu, std =GP.predict([point],return_std = True)
#print(pred)
#
#for j in range(len(thresholds) - 1):
alpha_j = (thresholds[j] - mu[0]) / std[0]
beta_j = (thresholds[j+1] - mu[0]) / std[0]
pcj = norm.cdf(beta_j) - norm.cdf(alpha_j)
return pcj
grids=[]
for region in range(len(thresholds) - 1):
thisgrid=[]
alldata=[]
for _ in range(npoints):
point=[np.random.uniform(0.0,1.0) for _ in range(ndim)]
pcj=getpcj(point,region)
alldata.append([point,pcj])
# now get the maximum
maxpcj=max([x[1] for x in alldata])
if maxpcj < 1e-3:
maxpcj = 1.0
for pt in alldata:
if np.random.uniform(0.0,1.0) < pt[1]/maxpcj:
thisgrid.append(pt)
tries=0
while len(thisgrid) < npoints and tries < 100*npoints:
tries=tries+1
point=[np.random.uniform(0.0,1.0) for _ in range(ndim)]
pcj=getpcj(point,region)
if pcj > maxpcj: # refactor all points
newgrid=[]
for pt in thisgrid:
if np.random.uniform(0.0,1.0) < maxpcj/pcj:
newgrid.append(pt)
thisgrid=newgrid
maxpcj=pcj
thisgrid.append([point,pcj])
else:
if np.random.uniform(0.0,1.0) < pcj/maxpcj:
thisgrid.append([point,pcj])
grids.append(np.array([x[0] for x in thisgrid])) # only keep the points, not the probabilities
return grids
[docs]
def approx_mi_vec(mu, cov, threshold_start,threshold_end):
std_sx = []
#for j in range(len(thresholds) - 1):
if True:
mu1 = mu[:, 0]
std1 = cov[:, 0, 0] ** 0.5
mu2 = mu[:, 1]
std2 = cov[:, 1, 1] ** 0.5
rho = cov[:, 0, 1] / (std1 * std2)
alpha_j = (threshold_start - mu2) / std2
beta_j = (threshold_end - mu2) / std2
c_j = norm.cdf(beta_j) - norm.cdf(alpha_j)
# \sigma(Y(X)|S(x')=j)
npdfb=norm.pdf(beta_j)
npdfa=norm.pdf(alpha_j)
npdfb[~np.isfinite(beta_j)]=0.0
npdfa[~np.isfinite(alpha_j)]=0.0
beta_j[~np.isfinite(beta_j)] = 0.0
alpha_j[~np.isfinite(alpha_j)] = 0.0
b_phi_b = beta_j * npdfb
a_phi_a = alpha_j * npdfa
deltaab=npdfb-npdfa
c_j[ c_j ==0.0] = 1.0 # always multiplied by 0
mu_cond = mu1 - std1 * rho / c_j * deltaab
var_cond = (mu1 ** 2 - 2 * mu1 * std1 * (rho / c_j * deltaab) +
std1 ** 2 * (1. - (rho ** 2 / c_j) * (b_phi_b - a_phi_a)) -
mu_cond ** 2)
std_sx_j = var_cond ** 0.5
#std_sx.append(std_sx_j)
"""
b_phi_b = beta_j * norm.pdf(beta_j)
b_phi_b[~np.isfinite(beta_j)] = 0.0
a_phi_a = alpha_j * norm.pdf(alpha_j)
a_phi_a[~np.isfinite(alpha_j)] = 0.0
mu_cond = mu1 - std1 * rho / c_j * (norm.pdf(beta_j) - norm.pdf(alpha_j))
var_cond = (mu1 ** 2 - 2 * mu1 * std1 * (rho / c_j * (norm.pdf(beta_j) - norm.pdf(alpha_j))) +
std1 ** 2 * (1. - (rho ** 2 / c_j) * (b_phi_b - a_phi_a)) -
mu_cond ** 2)
std_sx_j = var_cond ** 0.5
std_sx.append(std_sx_j)
"""
# Entropy
CONSTANT = (2 * np.e * np.pi) ** 0.5
logconst=np.log(CONSTANT)
#for j in range(len(thresholds) - 1):
mu1 = mu[:, 0]
std1 = cov[:, 0, 0] ** 0.5
mu2 = mu[:, 1]
std2 = cov[:, 1, 1] ** 0.5
h = np.log(std1 * CONSTANT)
p_j = norm(mu2, std2).cdf(threshold_end) - norm(mu2, std2).cdf(threshold_start)
#dec = p_j * np.log(std_sx[j] * CONSTANT)
dec = p_j * (np.log(std_sx_j)+logconst)
h[p_j > 0.0] -= dec[p_j > 0.0]
return h
[docs]
def info_gain(x_candidate, gp, thresholds, meanXs,noise=0.0): ## don't bother including noise in info gain
info_total=0.0
for j in range(len(thresholds) - 1):
meanX=meanXs[j]
n_samples = len(meanX)
X_all = np.concatenate([np.array([x_candidate]), meanX]).reshape(1 + n_samples, -1)
K_trans_all = gp.kernel_(X_all, gp.X_train_) ## off-diagonal -> no noise
y_mean_all = K_trans_all.dot(gp.alpha_) + gp._y_train_mean
v_all = cho_solve((gp.L_, True), K_trans_all.T)
mus = np.zeros((n_samples, 2))
mus[:, 0] = y_mean_all[0]
mus[:, 1] = y_mean_all[1:]
covs = np.zeros((n_samples, 2, 2))
c = gp.kernel_(X_all[:1], X_all) # 0:1 means just the 0th entry, but keep same number of indices
#-> x_candidate
covs[:, 0, 0] = c[0, 0]+noise
covs[:, 1, 1] = c[0, 0]+noise # all the diagonal entries are identical because they are zero distance from themselves!
#covs[:, 1, 1] = c2[:]+noise
covs[:, 0, 1] = c[0, 1:]
covs[:, 1, 0] = c[0, 1:]
x_train_len = len(gp.X_train_)
K_trans_all_repack = np.zeros((n_samples, 2, x_train_len))
K_trans_all_repack[:, 0, :] = K_trans_all[0, :]
K_trans_all_repack[:, 1, :] = K_trans_all[1:]
v_all_repack = np.zeros((n_samples, x_train_len, 2))
v_all_repack[:, :, 0] = v_all[:, 0]
v_all_repack[:, :, 1] = v_all[:, 1:].T
covs -= np.einsum('...ij,...jk->...ik', K_trans_all_repack, v_all_repack)
mi = approx_mi_vec(mus, covs, thresholds[j],thresholds[j+1])
mi[~np.isfinite(mi)] = 0.0
info_total-=np.mean(mi)
return info_total
#return -np.mean(mi)
[docs]
class NewScan(Scan):
"""Scanner class for Grid Scans"""
def __init__(self, inputs, log):
Scan.__init__(self, inputs, log)
self.downscalers,self.upscalers = BSMutil.create_scalers(self.inputs,log)
self.ndim=len(self.upscalers)
if 'Points' in self.inputs['Setup']:
self.maxpoints=self.inputs['Setup']['Points']
else:
self.maxpoints=100
self.noise=self.inputs['Setup'].get('Noise',None)
self.initialpoints=int(self.inputs['Setup'].get('Initial Points',5))
self.gridsize=int(self.inputs['Setup'].get('Grid Size',100))
self.samplestocheck=int(self.inputs['Setup'].get('Sample Size',50))
self.sample_multiplier=int(self.inputs['Setup'].get('Sample Multiplier',5))
self.bootstrap_file=None
if "Input_CSV_File" in self.inputs["Setup"]:
in_file=self.inputs["Setup"]["Input_CSV_File"]
if os.path.exists(in_file):
self.bootstrap_file=in_file
self.fail_value=float(self.inputs['Setup'].get('Fail Value',10.0))
if len(self.upscalers) == 0:
log.error('You need to specify at least one variable with a RANGE!')
raise
if "Threshold" not in self.inputs["Setup"]:
self.threshold=0.0
else:
self.threshold=float(self.inputs["Setup"]["Threshold"])
self.function_mode=False
if 'Function' in self.inputs["Setup"]: ### will define the contour via some function
self.function_mode=True
self.contour_function=self.inputs["Setup"]["Function"]
else:
self.likelihoodfns=MakeLikelihoods(self.inputs['Observables'])
[docs]
def postprocess(self,Point, observables, data_point,temp_dir,log, lock=None):
""" Get the function as a product of the likelihoods """
if self.function_mode: # use a function of variables and observables to get the contour
try:
all_dict=data_point.get_all_dict()
res=float(eval(self.contour_function,all_dict))
return res
except:
return self.fail_value
else: # use likelihood functions instead
try:
lh = 1.
for v, f in zip(observables,self.likelihoodfns):
lh = lh * f(float(v))
return lh -self.threshold #-self.threshold
except:
return self.fail_value
[docs]
def get_random_point(self):
return [np.random.uniform(0,1.0)
for _ in range(self.ndim)]
[docs]
def run(self):
def optifunc(inpoints):
points=[[self.upscalers[n](x[n]) for n in range(self.ndim)] for x in inpoints]
try:
## Update progress bar for batches, not for individual points
if len(points) > 1:
outlh=self.RunManager.run_batch(points)
else:
outlh=[self.RunManager.CoreRuns[0].run_point(points[0])]
outlh = [ x if x != [] else self.fail_value for x in outlh ]
#print(outlh)
res=np.array(outlh)
return res
except:
return np.array([self.fail_value]*len(points))
mykernel=ConstantKernel() * RBF(length_scale_bounds=[0.1,0.5], length_scale=[0.1]*self.ndim)
if self.noise is not None:
self.myGP=sklearn.gaussian_process.GaussianProcessRegressor(kernel=mykernel,alpha=self.noise)
else:
self.myGP=sklearn.gaussian_process.GaussianProcessRegressor(kernel=mykernel)
## first get some random points
self.all_points_x=[]
self.all_points_y=[]
start_points=[]
if self.bootstrap_file is None:
for _ in range(self.initialpoints):
start_points.append(self.get_random_point())
self.all_points_x = np.array(self.all_points_x + start_points)
self.log.info("Running initial batch of random points")
self.all_points_y = optifunc(start_points)
else:
import pandas as pd
input_data=pd.read_csv(self.bootstrap_file)
invars=[x for x in self.inputs["Variables"].keys()]
initial_points=pd.DataFrame(input_data,columns=invars).to_numpy().tolist()
start_points=[[sf(y) for sf,y in zip(self.downscalers,point)] for point in initial_points]
self.all_points_x = np.array(start_points)
self.all_points_y=pd.DataFrame(input_data,columns=['Result']).to_numpy()[:,0]
## gridsearch for evaluation
import itertools
#self.pointsgrid=np.array([np.linspace(-0.5,0.5,10) for _ in range(self.ndim)]).transpose()
self.pointsgrid=np.array(list(itertools.product(*[np.linspace(0.0,1.0,26) for _ in range(self.ndim)])))
self.myGP=self.myGP.fit(start_points,self.all_points_y)
self.log.info("Running main loop")
while self.RunManager.nextpointID <= self.maxpoints:
newpoints=getnewcandidates(self.myGP,self.ndim,self.samplestocheck,self.sample_multiplier)
newgrids=getgrids(self.myGP,self.ndim,self.gridsize)
for ng, grid in enumerate(newgrids):
if len(grid) == 0:
newgrids[ng] = self.pointsgrid
infos=np.array([info_gain(x_candidate, self.myGP, [-np.inf,0.0,np.inf], newgrids) for x_candidate in newpoints])
bestpoint_n=np.argmin(infos)
bestpoint=np.array([list(newpoints[bestpoint_n])])
self.all_points_x = np.vstack((self.all_points_x,bestpoint))
newy=optifunc(bestpoint)
self.all_points_y = np.hstack((self.all_points_y,newy))
self.myGP=self.myGP.fit(self.all_points_x,self.all_points_y)
UpdateProgress(self.RunManager.nextpointID-1, self.maxpoints)