Adding weights

Hi, 

I think it's relatively trivial to add a weighted version of this, which would be very useful to some folks (including myself).

example of an implementation below:
`def CountQuadsWeighted(Arr2D, point, weights):
    """ Weighted version of CountQuads. Computes the probabilities of finding
    points in each 4 quadrant defined by a vertical and horizontal lines
    crossing the point, by summing the weights of points in Arr2D in each
    quadrant and normalizing by the total weight sum.

    :param list Arr2D: Array of points to be counted.
    :param array point: A 2 element list, point, which is the center of
    4 square quadrants.
    :param array weights: 1D array of per-event weights with the same length
    as Arr2D. Compatible with sWeights, including negative values, provided
    that the sum of weights is not zero.
    :returns: a tuple of 4 floats.  The weighted probabilities of finding a
    point in each quadrants, with point as the origin.  p stands for positive,
    n for negative, with the first and second positions meaning the x and y
    directions respectively.
    """
    if isinstance(point, list):
        point = np.asarray((np.ravel(point)))
    elif type(point).__module__+type(point).__name__ == 'numpyndarray':
        point = np.ravel(point.copy())
    else:
        raise TypeError('Input point is neither list nor numpyndarray')
    if len(point) != 2:
        return
    if isinstance(Arr2D, list):
        Arr2D = np.asarray((Arr2D))
    elif type(Arr2D).__module__+type(Arr2D).__name__ == 'numpyndarray':
        pass
    else:
        raise TypeError('Input Arr2D is neither list nor numpyndarray')
    if Arr2D.shape[1] > Arr2D.shape[0]:  # Reshape to A[row,column]
        Arr2D = Arr2D.copy().T
    if Arr2D.shape[1] != 2:
        raise TypeError('Input Arr2D is not 2D')
    if isinstance(weights, list):
        weights = np.asarray(weights)
    elif type(weights).__module__+type(weights).__name__ == 'numpyndarray':
        weights = np.ravel(weights.copy())
    else:
        raise TypeError('Input weights is neither list nor numpyndarray')
    if len(weights) != Arr2D.shape[0]:
        raise TypeError('Input weights length does not match Arr2D')
    # The pp of Qpp refer to p for 'positive' and n for 'negative' quadrants.
    # In order. first subscript is x, second is y.
    Mpp = (Arr2D[:, 0] > point[0]) & (Arr2D[:, 1] > point[1])
    Mnp = (Arr2D[:, 0] < point[0]) & (Arr2D[:, 1] > point[1])
    Mpn = (Arr2D[:, 0] > point[0]) & (Arr2D[:, 1] < point[1])
    Mnn = (Arr2D[:, 0] < point[0]) & (Arr2D[:, 1] < point[1])
    # Normalized weighted fractions:
    wsum = np.sum(weights)
    fpp = np.sum(weights[Mpp])/wsum
    fnp = np.sum(weights[Mnp])/wsum
    fpn = np.sum(weights[Mpn])/wsum
    fnn = np.sum(weights[Mnn])/wsum
    # NOTE: as in CountQuads, the f's sum to 1.0 only up to floating-point
    # error, and points sitting exactly on the dividing lines are excluded.
    return(fpp, fnp, fpn, fnn)


def WeightedPearsonR(x, y, weights):
    """ Weighted Pearson linear correlation coefficient. Used by the
    weighted 2D KS tests in place of scipy.stats.pearsonr, which has no
    native weights argument. Defined as the weighted covariance over the
    geometric mean of the weighted variances.

    :param array x: 1D array of x coordinates.
    :param array y: 1D array of y coordinates, same length as x.
    :param array weights: 1D array of weights, same length as x.
    :returns: a float. The weighted Pearson correlation coefficient.
    """
    wsum = np.sum(weights)
    xbar = np.sum(weights*x)/wsum
    ybar = np.sum(weights*y)/wsum
    cov = np.sum(weights*(x-xbar)*(y-ybar))/wsum
    varx = np.sum(weights*(x-xbar)**2)/wsum
    vary = np.sum(weights*(y-ybar)**2)/wsum
    return cov/np.sqrt(varx*vary)


def ks2d2sWeighted(Arr2D1, Arr2D2, weights1, weights2):
    """ Weighted version of ks2d2s. ks stands for Kolmogorov-Smirnov, 2d for
    2 dimensional, 2s for 2 samples.
    KS test for goodness-of-fit on two 2D samples, each with per-event
    weights. Tests the hypothesis that the two weighted samples are drawn
    from the same distribution. Designed to be usable with sWeights, where
    negative weights are possible.

    The unweighted sample size N is replaced everywhere by the Kish
    effective sample size N_eff = (sum(w))^2 / sum(w^2), and the Pearson
    correlation entering the RR correction is computed with weights.

    :param array Arr2D1: 2D array of points/samples.
    :param array Arr2D2: 2D array of points/samples.
    :param array weights1: 1D weights array, same length as Arr2D1.
    :param array weights2: 1D weights array, same length as Arr2D2.
    :returns: a tuple of two floats. First, the two-sample K-S statistic.
    If this value is higher than the significance level of the hypothesis,
    it is rejected. Second, the significance level of *d*. Small values of
    prob show that the two samples are significantly different.
    """
    if type(Arr2D1).__module__+type(Arr2D1).__name__ == 'numpyndarray':
        pass
    else:
        raise TypeError('Input Arr2D1 is neither list nor numpyndarray')
    if Arr2D1.shape[1] > Arr2D1.shape[0]:
        Arr2D1 = Arr2D1.copy().T
    if type(Arr2D2).__module__+type(Arr2D2).__name__ == 'numpyndarray':
        pass
    else:
        raise TypeError('Input Arr2D2 is neither list nor numpyndarray')
    if Arr2D2.shape[1] > Arr2D2.shape[0]:
        Arr2D2 = Arr2D2.copy().T
    if Arr2D1.shape[1] != 2:
        raise TypeError('Input Arr2D1 is not 2D')
    if Arr2D2.shape[1] != 2:
        raise TypeError('Input Arr2D2 is not 2D')
    if isinstance(weights1, list):
        weights1 = np.asarray(weights1)
    weights1 = np.ravel(weights1)
    if isinstance(weights2, list):
        weights2 = np.asarray(weights2)
    weights2 = np.ravel(weights2)
    if len(weights1) != Arr2D1.shape[0]:
        raise TypeError('Input weights1 length does not match Arr2D1')
    if len(weights2) != Arr2D2.shape[0]:
        raise TypeError('Input weights2 length does not match Arr2D2')
    d1, d2 = 0., 0.
    for point1 in Arr2D1:
        fpp1, fmp1, fpm1, fmm1 = CountQuadsWeighted(Arr2D1, point1, weights1)
        fpp2, fmp2, fpm2, fmm2 = CountQuadsWeighted(Arr2D2, point1, weights2)
        d1 = max(d1, abs(fpp1-fpp2))
        d1 = max(d1, abs(fpm1-fpm2))
        d1 = max(d1, abs(fmp1-fmp2))
        d1 = max(d1, abs(fmm1-fmm2))
    for point2 in Arr2D2:
        fpp1, fmp1, fpm1, fmm1 = CountQuadsWeighted(Arr2D1, point2, weights1)
        fpp2, fmp2, fpm2, fmm2 = CountQuadsWeighted(Arr2D2, point2, weights2)
        d2 = max(d2, abs(fpp1-fpp2))
        d2 = max(d2, abs(fpm1-fpm2))
        d2 = max(d2, abs(fmp1-fmp2))
        d2 = max(d2, abs(fmm1-fmm2))
    d = (d1+d2)/2.
    # Kish effective sample size replaces the raw N in the Peacock/Press
    # prefactor. Reduces exactly to N for uniform weights.
    Neff1 = np.sum(weights1)**2/np.sum(weights1**2)
    Neff2 = np.sum(weights2)**2/np.sum(weights2**2)
    sqen = np.sqrt(Neff1*Neff2/(Neff1+Neff2))
    R1 = WeightedPearsonR(Arr2D1[:, 0], Arr2D1[:, 1], weights1)
    R2 = WeightedPearsonR(Arr2D2[:, 0], Arr2D2[:, 1], weights2)
    RR = np.sqrt(1.-(R1*R1+R2*R2)/2.)
    prob = Qks(d*sqen/(1.+RR*(0.25-0.75/sqen)))
    return(d, prob)


def ks2d1sWeighted(Arr2D, func2D, weights, xlim=[], ylim=[]):
    """ Weighted version of ks2d1s. ks stands for Kolmogorov-Smirnov, 2d for
    2 dimensional, 1s for 1 sample.
    KS test for goodness-of-fit on one weighted 2D sample and one 2D density
    distribution. Tests the hypothesis that the weighted sample was generated
    from the density distribution. The density itself is unweighted, so only
    Arr2D needs a weights vector.

    :param array Arr2D: 2D array of points/samples.
    :param func2D: Density distribution that takes 2 arguments: x and y.
    :param array weights: 1D weights array, same length as Arr2D.
    :param array xlim, ylim: Defines the domain for the numerical integration
    necessary to compute the quadrant probabilities.
    :returns: tuple of two floats. First, the K-S statistic. If this value
    is higher than the significance level of the hypothesis, it is rejected.
    Second, the significance level of *d*.
    """
    if callable(func2D):
        if len(inspect.getfullargspec(func2D)[0]) != 2:
            raise TypeError('Input func2D is not a function with 2 input arguments')
        pass
    else:
        raise TypeError('Input func2D is not a function')
    if type(Arr2D).__module__+type(Arr2D).__name__ == 'numpyndarray':
        pass
    else:
        raise TypeError('Input Arr2D is neither list nor numpyndarray')
    if Arr2D.shape[1] > Arr2D.shape[0]:
        Arr2D = Arr2D.copy().T
    if Arr2D.shape[1] != 2:
        raise TypeError('Input Arr2D is not 2D')
    if isinstance(weights, list):
        weights = np.asarray(weights)
    weights = np.ravel(weights)
    if len(weights) != Arr2D.shape[0]:
        raise TypeError('Input weights length does not match Arr2D')
    if xlim == []:
        xlim.append(np.amin(Arr2D[:, 0]) -
                    abs(np.amin(Arr2D[:, 0]) -
                    np.amax(Arr2D[:, 0]))/10)
        xlim.append(np.amax(Arr2D[:, 0]) -
                    abs(np.amin(Arr2D[:, 0]) -
                    np.amax(Arr2D[:, 0]))/10)
    if ylim == []:
        ylim.append(np.amin(Arr2D[:, 1]) -
                    abs(np.amin(Arr2D[:, 1]) -
                    np.amax(Arr2D[:, 1]))/10)
        ylim.append(np.amax(Arr2D[:, 1]) -
                    abs(np.amin(Arr2D[:, 1]) -
                    np.amax(Arr2D[:, 1]))/10)
    d = 0
    for point in Arr2D:
        fpp1, fmp1, fpm1, fmm1 = FuncQuads(func2D, point, xlim, ylim)
        fpp2, fmp2, fpm2, fmm2 = CountQuadsWeighted(Arr2D, point, weights)
        d = max(d, abs(fpp1-fpp2))
        d = max(d, abs(fpm1-fpm2))
        d = max(d, abs(fmp1-fmp2))
        d = max(d, abs(fmm1-fmm2))
    Neff = np.sum(weights)**2/np.sum(weights**2)
    sqen = np.sqrt(Neff)
    R1 = WeightedPearsonR(Arr2D[:, 0], Arr2D[:, 1], weights)
    RR = np.sqrt(1.0-R1**2)
    prob = Qks(d*sqen/(1.+RR*(0.25-0.75/sqen)))
    return d, prob`

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Adding weights #5

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Adding weights #5

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions