diff --git a/brainiak/fcma/util.py b/brainiak/fcma/util.py index de05e8748..a4b392907 100644 --- a/brainiak/fcma/util.py +++ b/brainiak/fcma/util.py @@ -29,7 +29,7 @@ ] -def _normalize_for_correlation(data, axis): +def _normalize_for_correlation(data, axis, return_nans=False): """normalize the data before computing correlation The data will be z-scored and divided by sqrt(n) @@ -42,6 +42,9 @@ def _normalize_for_correlation(data, axis): axis: int specify which dimension of the data should be normalized + return_nans: bool, default:False + If False, return zeros for NaNs; if True, return NaNs + Returns ------- data: 2D array @@ -50,13 +53,14 @@ def _normalize_for_correlation(data, axis): shape = data.shape data = zscore(data, axis=axis, ddof=0) # if zscore fails (standard deviation is zero), - # set all values to be zero - data = np.nan_to_num(data) + # optionally set all values to be zero + if not return_nans: + data = np.nan_to_num(data) data = data / math.sqrt(shape[axis]) return data -def compute_correlation(matrix1, matrix2): +def compute_correlation(matrix1, matrix2, return_nans=False): """compute correlation between two sets of variables Correlate the rows of matrix1 with the rows of matrix2. @@ -87,6 +91,9 @@ def compute_correlation(matrix1, matrix2): {\\sqrt{\\sum\\limits_{j=1}^n x_j^2-n\\bar{x}}} \\frac{(y_i-\\bar{y})}{\\sqrt{\\sum\\limits_{j=1}^n y_j^2-n\\bar{y}}}) + By default (return_nans=False), returns zeros for vectors with NaNs. + If return_nans=True, convert zeros to NaNs (np.nan) in output. + Parameters ---------- matrix1: 2D array in shape [r1, c] @@ -95,6 +102,9 @@ def compute_correlation(matrix1, matrix2): matrix2: 2D array in shape [r2, c] MUST be continuous and row-major + return_nans: bool, default:False + If False, return zeros for NaNs; if True, return NaNs + Returns ------- corr_data: 2D array in shape [r1, r2] @@ -107,8 +117,10 @@ def compute_correlation(matrix1, matrix2): if d1 != d2: raise ValueError('Dimension discrepancy') # preprocess two components - matrix1 = _normalize_for_correlation(matrix1, 1) - matrix2 = _normalize_for_correlation(matrix2, 1) + matrix1 = _normalize_for_correlation(matrix1, 1, + return_nans=return_nans) + matrix2 = _normalize_for_correlation(matrix2, 1, + return_nans=return_nans) corr_data = np.empty((r1, r2), dtype=np.float32, order='C') # blas routine is column-major blas.compute_single_matrix_multiplication('T', 'N', diff --git a/tests/fcma/test_util.py b/tests/fcma/test_util.py index 6eb7782f2..6508d49b1 100644 --- a/tests/fcma/test_util.py +++ b/tests/fcma/test_util.py @@ -39,5 +39,21 @@ def test_correlation_computation(): "correlation results between two sets") +def test_correlation_nans(): + row1 = 5 + col = 10 + row2 = 6 + mat1 = prng.rand(row1, col).astype(np.float32) + mat2 = prng.rand(row2, col).astype(np.float32) + mat1[0, 0] = np.nan + corr = compute_correlation(mat1, mat2, return_nans=False) + assert np.all(corr == 0, axis=1)[0] + assert np.sum(corr == 0) == row2 + corr = compute_correlation(mat1, mat2, return_nans=True) + assert np.all(np.isnan(corr), axis=1)[0] + assert np.sum(np.isnan(corr)) == row2 + + if __name__ == '__main__': test_correlation_computation() + test_correlation_nans()