Hi,
I think it's relatively trivial to add a weighted version of this, which would be very useful to some folks (including myself).
example of an implementation below:
`def CountQuadsWeighted(Arr2D, point, weights):
""" Weighted version of CountQuads. Computes the probabilities of finding
points in each 4 quadrant defined by a vertical and horizontal lines
crossing the point, by summing the weights of points in Arr2D in each
quadrant and normalizing by the total weight sum.
:param list Arr2D: Array of points to be counted.
:param array point: A 2 element list, point, which is the center of
4 square quadrants.
:param array weights: 1D array of per-event weights with the same length
as Arr2D. Compatible with sWeights, including negative values, provided
that the sum of weights is not zero.
:returns: a tuple of 4 floats. The weighted probabilities of finding a
point in each quadrants, with point as the origin. p stands for positive,
n for negative, with the first and second positions meaning the x and y
directions respectively.
"""
if isinstance(point, list):
point = np.asarray((np.ravel(point)))
elif type(point).__module__+type(point).__name__ == 'numpyndarray':
point = np.ravel(point.copy())
else:
raise TypeError('Input point is neither list nor numpyndarray')
if len(point) != 2:
return
if isinstance(Arr2D, list):
Arr2D = np.asarray((Arr2D))
elif type(Arr2D).__module__+type(Arr2D).__name__ == 'numpyndarray':
pass
else:
raise TypeError('Input Arr2D is neither list nor numpyndarray')
if Arr2D.shape[1] > Arr2D.shape[0]: # Reshape to A[row,column]
Arr2D = Arr2D.copy().T
if Arr2D.shape[1] != 2:
raise TypeError('Input Arr2D is not 2D')
if isinstance(weights, list):
weights = np.asarray(weights)
elif type(weights).__module__+type(weights).__name__ == 'numpyndarray':
weights = np.ravel(weights.copy())
else:
raise TypeError('Input weights is neither list nor numpyndarray')
if len(weights) != Arr2D.shape[0]:
raise TypeError('Input weights length does not match Arr2D')
# The pp of Qpp refer to p for 'positive' and n for 'negative' quadrants.
# In order. first subscript is x, second is y.
Mpp = (Arr2D[:, 0] > point[0]) & (Arr2D[:, 1] > point[1])
Mnp = (Arr2D[:, 0] < point[0]) & (Arr2D[:, 1] > point[1])
Mpn = (Arr2D[:, 0] > point[0]) & (Arr2D[:, 1] < point[1])
Mnn = (Arr2D[:, 0] < point[0]) & (Arr2D[:, 1] < point[1])
# Normalized weighted fractions:
wsum = np.sum(weights)
fpp = np.sum(weights[Mpp])/wsum
fnp = np.sum(weights[Mnp])/wsum
fpn = np.sum(weights[Mpn])/wsum
fnn = np.sum(weights[Mnn])/wsum
# NOTE: as in CountQuads, the f's sum to 1.0 only up to floating-point
# error, and points sitting exactly on the dividing lines are excluded.
return(fpp, fnp, fpn, fnn)
def WeightedPearsonR(x, y, weights):
""" Weighted Pearson linear correlation coefficient. Used by the
weighted 2D KS tests in place of scipy.stats.pearsonr, which has no
native weights argument. Defined as the weighted covariance over the
geometric mean of the weighted variances.
:param array x: 1D array of x coordinates.
:param array y: 1D array of y coordinates, same length as x.
:param array weights: 1D array of weights, same length as x.
:returns: a float. The weighted Pearson correlation coefficient.
"""
wsum = np.sum(weights)
xbar = np.sum(weights*x)/wsum
ybar = np.sum(weights*y)/wsum
cov = np.sum(weights*(x-xbar)*(y-ybar))/wsum
varx = np.sum(weights*(x-xbar)**2)/wsum
vary = np.sum(weights*(y-ybar)**2)/wsum
return cov/np.sqrt(varx*vary)
def ks2d2sWeighted(Arr2D1, Arr2D2, weights1, weights2):
""" Weighted version of ks2d2s. ks stands for Kolmogorov-Smirnov, 2d for
2 dimensional, 2s for 2 samples.
KS test for goodness-of-fit on two 2D samples, each with per-event
weights. Tests the hypothesis that the two weighted samples are drawn
from the same distribution. Designed to be usable with sWeights, where
negative weights are possible.
The unweighted sample size N is replaced everywhere by the Kish
effective sample size N_eff = (sum(w))^2 / sum(w^2), and the Pearson
correlation entering the RR correction is computed with weights.
:param array Arr2D1: 2D array of points/samples.
:param array Arr2D2: 2D array of points/samples.
:param array weights1: 1D weights array, same length as Arr2D1.
:param array weights2: 1D weights array, same length as Arr2D2.
:returns: a tuple of two floats. First, the two-sample K-S statistic.
If this value is higher than the significance level of the hypothesis,
it is rejected. Second, the significance level of *d*. Small values of
prob show that the two samples are significantly different.
"""
if type(Arr2D1).__module__+type(Arr2D1).__name__ == 'numpyndarray':
pass
else:
raise TypeError('Input Arr2D1 is neither list nor numpyndarray')
if Arr2D1.shape[1] > Arr2D1.shape[0]:
Arr2D1 = Arr2D1.copy().T
if type(Arr2D2).__module__+type(Arr2D2).__name__ == 'numpyndarray':
pass
else:
raise TypeError('Input Arr2D2 is neither list nor numpyndarray')
if Arr2D2.shape[1] > Arr2D2.shape[0]:
Arr2D2 = Arr2D2.copy().T
if Arr2D1.shape[1] != 2:
raise TypeError('Input Arr2D1 is not 2D')
if Arr2D2.shape[1] != 2:
raise TypeError('Input Arr2D2 is not 2D')
if isinstance(weights1, list):
weights1 = np.asarray(weights1)
weights1 = np.ravel(weights1)
if isinstance(weights2, list):
weights2 = np.asarray(weights2)
weights2 = np.ravel(weights2)
if len(weights1) != Arr2D1.shape[0]:
raise TypeError('Input weights1 length does not match Arr2D1')
if len(weights2) != Arr2D2.shape[0]:
raise TypeError('Input weights2 length does not match Arr2D2')
d1, d2 = 0., 0.
for point1 in Arr2D1:
fpp1, fmp1, fpm1, fmm1 = CountQuadsWeighted(Arr2D1, point1, weights1)
fpp2, fmp2, fpm2, fmm2 = CountQuadsWeighted(Arr2D2, point1, weights2)
d1 = max(d1, abs(fpp1-fpp2))
d1 = max(d1, abs(fpm1-fpm2))
d1 = max(d1, abs(fmp1-fmp2))
d1 = max(d1, abs(fmm1-fmm2))
for point2 in Arr2D2:
fpp1, fmp1, fpm1, fmm1 = CountQuadsWeighted(Arr2D1, point2, weights1)
fpp2, fmp2, fpm2, fmm2 = CountQuadsWeighted(Arr2D2, point2, weights2)
d2 = max(d2, abs(fpp1-fpp2))
d2 = max(d2, abs(fpm1-fpm2))
d2 = max(d2, abs(fmp1-fmp2))
d2 = max(d2, abs(fmm1-fmm2))
d = (d1+d2)/2.
# Kish effective sample size replaces the raw N in the Peacock/Press
# prefactor. Reduces exactly to N for uniform weights.
Neff1 = np.sum(weights1)**2/np.sum(weights1**2)
Neff2 = np.sum(weights2)**2/np.sum(weights2**2)
sqen = np.sqrt(Neff1*Neff2/(Neff1+Neff2))
R1 = WeightedPearsonR(Arr2D1[:, 0], Arr2D1[:, 1], weights1)
R2 = WeightedPearsonR(Arr2D2[:, 0], Arr2D2[:, 1], weights2)
RR = np.sqrt(1.-(R1*R1+R2*R2)/2.)
prob = Qks(d*sqen/(1.+RR*(0.25-0.75/sqen)))
return(d, prob)
def ks2d1sWeighted(Arr2D, func2D, weights, xlim=[], ylim=[]):
""" Weighted version of ks2d1s. ks stands for Kolmogorov-Smirnov, 2d for
2 dimensional, 1s for 1 sample.
KS test for goodness-of-fit on one weighted 2D sample and one 2D density
distribution. Tests the hypothesis that the weighted sample was generated
from the density distribution. The density itself is unweighted, so only
Arr2D needs a weights vector.
:param array Arr2D: 2D array of points/samples.
:param func2D: Density distribution that takes 2 arguments: x and y.
:param array weights: 1D weights array, same length as Arr2D.
:param array xlim, ylim: Defines the domain for the numerical integration
necessary to compute the quadrant probabilities.
:returns: tuple of two floats. First, the K-S statistic. If this value
is higher than the significance level of the hypothesis, it is rejected.
Second, the significance level of *d*.
"""
if callable(func2D):
if len(inspect.getfullargspec(func2D)[0]) != 2:
raise TypeError('Input func2D is not a function with 2 input arguments')
pass
else:
raise TypeError('Input func2D is not a function')
if type(Arr2D).__module__+type(Arr2D).__name__ == 'numpyndarray':
pass
else:
raise TypeError('Input Arr2D is neither list nor numpyndarray')
if Arr2D.shape[1] > Arr2D.shape[0]:
Arr2D = Arr2D.copy().T
if Arr2D.shape[1] != 2:
raise TypeError('Input Arr2D is not 2D')
if isinstance(weights, list):
weights = np.asarray(weights)
weights = np.ravel(weights)
if len(weights) != Arr2D.shape[0]:
raise TypeError('Input weights length does not match Arr2D')
if xlim == []:
xlim.append(np.amin(Arr2D[:, 0]) -
abs(np.amin(Arr2D[:, 0]) -
np.amax(Arr2D[:, 0]))/10)
xlim.append(np.amax(Arr2D[:, 0]) -
abs(np.amin(Arr2D[:, 0]) -
np.amax(Arr2D[:, 0]))/10)
if ylim == []:
ylim.append(np.amin(Arr2D[:, 1]) -
abs(np.amin(Arr2D[:, 1]) -
np.amax(Arr2D[:, 1]))/10)
ylim.append(np.amax(Arr2D[:, 1]) -
abs(np.amin(Arr2D[:, 1]) -
np.amax(Arr2D[:, 1]))/10)
d = 0
for point in Arr2D:
fpp1, fmp1, fpm1, fmm1 = FuncQuads(func2D, point, xlim, ylim)
fpp2, fmp2, fpm2, fmm2 = CountQuadsWeighted(Arr2D, point, weights)
d = max(d, abs(fpp1-fpp2))
d = max(d, abs(fpm1-fpm2))
d = max(d, abs(fmp1-fmp2))
d = max(d, abs(fmm1-fmm2))
Neff = np.sum(weights)**2/np.sum(weights**2)
sqen = np.sqrt(Neff)
R1 = WeightedPearsonR(Arr2D[:, 0], Arr2D[:, 1], weights)
RR = np.sqrt(1.0-R1**2)
prob = Qks(d*sqen/(1.+RR*(0.25-0.75/sqen)))
return d, prob`
Hi,
I think it's relatively trivial to add a weighted version of this, which would be very useful to some folks (including myself).
example of an implementation below:
`def CountQuadsWeighted(Arr2D, point, weights):
""" Weighted version of CountQuads. Computes the probabilities of finding
points in each 4 quadrant defined by a vertical and horizontal lines
crossing the point, by summing the weights of points in Arr2D in each
quadrant and normalizing by the total weight sum.
def WeightedPearsonR(x, y, weights):
""" Weighted Pearson linear correlation coefficient. Used by the
weighted 2D KS tests in place of scipy.stats.pearsonr, which has no
native weights argument. Defined as the weighted covariance over the
geometric mean of the weighted variances.
def ks2d2sWeighted(Arr2D1, Arr2D2, weights1, weights2):
""" Weighted version of ks2d2s. ks stands for Kolmogorov-Smirnov, 2d for
2 dimensional, 2s for 2 samples.
KS test for goodness-of-fit on two 2D samples, each with per-event
weights. Tests the hypothesis that the two weighted samples are drawn
from the same distribution. Designed to be usable with sWeights, where
negative weights are possible.
def ks2d1sWeighted(Arr2D, func2D, weights, xlim=[], ylim=[]):
""" Weighted version of ks2d1s. ks stands for Kolmogorov-Smirnov, 2d for
2 dimensional, 1s for 1 sample.
KS test for goodness-of-fit on one weighted 2D sample and one 2D density
distribution. Tests the hypothesis that the weighted sample was generated
from the density distribution. The density itself is unweighted, so only
Arr2D needs a weights vector.