""" Lowess testing suite. Expected outcomes are generated by R's lowess function given the same arguments. The R script test_lowess_r_outputs.R can be used to generate the expected outcomes. The delta tests utilize Silverman's motorcycle collision data, available in R's MASS package. """ import os import numpy as np from numpy.testing import ( assert_, assert_allclose, assert_almost_equal, assert_equal, assert_raises, ) import pytest from statsmodels.nonparametric.smoothers_lowess import lowess # Number of decimals to test equality with. # The default is 7. curdir = os.path.dirname(os.path.abspath(__file__)) rpath = os.path.join(curdir, "results") class TestLowess(object): def test_import(self): # this does not work # from statsmodels.api.nonparametric import lowess as lowess1 import statsmodels.api as sm lowess1 = sm.nonparametric.lowess assert_(lowess is lowess1) def test_flat(self): test_data = { "x": np.arange(20), "y": np.zeros(20), "out": np.zeros(20), } expected_lowess = np.array([test_data["x"], test_data["out"]]).T actual_lowess = lowess(test_data["y"], test_data["x"]) assert_almost_equal(expected_lowess, actual_lowess, 7) def test_range(self): test_data = { "x": np.arange(20), "y": np.arange(20), "out": np.arange(20), } expected_lowess = np.array([test_data["x"], test_data["out"]]).T actual_lowess = lowess(test_data["y"], test_data["x"]) assert_almost_equal(expected_lowess, actual_lowess, 7) @staticmethod def generate(name, fname, x="x", y="y", out="out", kwargs=None, decimal=7): kwargs = {} if kwargs is None else kwargs data = np.genfromtxt( os.path.join(rpath, fname), delimiter=",", names=True ) assert_almost_equal.description = name if callable(kwargs): kwargs = kwargs(data) result = lowess(data[y], data[x], **kwargs) expect = np.array([data[x], data[out]]).T assert_almost_equal(result, expect, decimal) # TODO: Refactor as parametrized test once nose is permanently dropped def test_simple(self): self.generate("test_simple", "test_lowess_simple.csv") def test_iter_0(self): self.generate( "test_iter_0", "test_lowess_iter.csv", out="out_0", kwargs={"it": 0}, ) def test_iter_0_3(self): self.generate( "test_iter_0", "test_lowess_iter.csv", out="out_3", kwargs={"it": 3}, ) def test_frac_2_3(self): self.generate( "test_frac_2_3", "test_lowess_frac.csv", out="out_2_3", kwargs={"frac": 2.0 / 3}, ) def test_frac_1_5(self): self.generate( "test_frac_1_5", "test_lowess_frac.csv", out="out_1_5", kwargs={"frac": 1.0 / 5}, ) def test_delta_0(self): self.generate( "test_delta_0", "test_lowess_delta.csv", out="out_0", kwargs={"frac": 0.1}, ) def test_delta_rdef(self): self.generate( "test_delta_Rdef", "test_lowess_delta.csv", out="out_Rdef", kwargs=lambda data: { "frac": 0.1, "delta": 0.01 * np.ptp(data["x"]), }, ) def test_delta_1(self): self.generate( "test_delta_1", "test_lowess_delta.csv", out="out_1", kwargs={"frac": 0.1, "delta": 1 + 1e-10}, decimal=10, ) def test_options(self): rfile = os.path.join(rpath, "test_lowess_simple.csv") test_data = np.genfromtxt(open(rfile, "rb"), delimiter=",", names=True) y, x = test_data["y"], test_data["x"] res1_fitted = test_data["out"] expected_lowess = np.array([test_data["x"], test_data["out"]]).T # check skip sorting actual_lowess1 = lowess(y, x, is_sorted=True) assert_almost_equal(actual_lowess1, expected_lowess, decimal=13) # check skip missing actual_lowess = lowess(y, x, is_sorted=True, missing="none") assert_almost_equal(actual_lowess, actual_lowess1, decimal=13) # check order/index, returns yfitted only actual_lowess = lowess(y[::-1], x[::-1], return_sorted=False) assert_almost_equal(actual_lowess, actual_lowess1[::-1, 1], decimal=13) # check returns yfitted only actual_lowess = lowess( y, x, return_sorted=False, missing="none", is_sorted=True ) assert_almost_equal(actual_lowess, actual_lowess1[:, 1], decimal=13) # check integer input actual_lowess = lowess(np.round(y).astype(int), x, is_sorted=True) actual_lowess1 = lowess(np.round(y), x, is_sorted=True) assert_almost_equal(actual_lowess, actual_lowess1, decimal=13) assert_(actual_lowess.dtype is np.dtype(float)) # this will also have duplicate x actual_lowess = lowess(y, np.round(x).astype(int), is_sorted=True) actual_lowess1 = lowess(y, np.round(x), is_sorted=True) assert_almost_equal(actual_lowess, actual_lowess1, decimal=13) assert_(actual_lowess.dtype is np.dtype(float)) # Test specifying xvals explicitly perm_idx = np.arange(len(x) // 2) np.random.shuffle(perm_idx) actual_lowess2 = lowess(y, x, xvals=x[perm_idx], return_sorted=False) assert_almost_equal( actual_lowess[perm_idx, 1], actual_lowess2, decimal=13 ) # check with nans, this changes the arrays y[[5, 6]] = np.nan x[3] = np.nan mask_valid = np.isfinite(x) & np.isfinite(y) # actual_lowess1[[3, 5, 6], 1] = np.nan actual_lowess = lowess(y, x, is_sorted=True) actual_lowess1 = lowess(y[mask_valid], x[mask_valid], is_sorted=True) assert_almost_equal(actual_lowess, actual_lowess1, decimal=13) assert_raises(ValueError, lowess, y, x, missing="raise") perm_idx = np.arange(len(x)) np.random.shuffle(perm_idx) yperm = y[perm_idx] xperm = x[perm_idx] actual_lowess2 = lowess(yperm, xperm, is_sorted=False) assert_almost_equal(actual_lowess, actual_lowess2, decimal=13) actual_lowess3 = lowess( yperm, xperm, is_sorted=False, return_sorted=False ) mask_valid = np.isfinite(xperm) & np.isfinite(yperm) assert_equal(np.isnan(actual_lowess3), ~mask_valid) # get valid sorted smoothed y from actual_lowess3 sort_idx = np.argsort(xperm) yhat = actual_lowess3[sort_idx] yhat = yhat[np.isfinite(yhat)] assert_almost_equal(yhat, actual_lowess2[:, 1], decimal=13) # Test specifying xvals explicitly, now with nans perm_idx = np.arange(actual_lowess.shape[0]) actual_lowess4 = lowess( y, x, xvals=actual_lowess[perm_idx, 0], return_sorted=False ) assert_almost_equal( actual_lowess[perm_idx, 1], actual_lowess4, decimal=13 ) def test_duplicate_xs(self): # see 2449 # Generate cases with many duplicate x values x = [0] + [1] * 100 + [2] * 100 + [3] y = x + np.random.normal(size=len(x)) * 1e-8 result = lowess(y, x, frac=50 / len(x), it=1) # fit values should be approximately averages of values at # a particular fit, which in this case are just equal to x assert_almost_equal(result[1:-1, 1], x[1:-1], decimal=7) def test_spike(self): # see 7700 # Create a curve that is easy to fit at first but gets # harder further along. # This used to give an outlier bad fit at position 961 x = np.linspace(0, 10, 1001) y = np.cos(x ** 2 / 5) result = lowess(y, x, frac=11 / len(x), it=1) assert_(np.all(result[:, 1] > np.min(y) - 0.1)) assert_(np.all(result[:, 1] < np.max(y) + 0.1)) def test_exog_predict(self): rfile = os.path.join(rpath, "test_lowess_simple.csv") test_data = np.genfromtxt(open(rfile, "rb"), delimiter=",", names=True) y, x = test_data["y"], test_data["x"] target = lowess(y, x, is_sorted=True) # Test specifying exog_predict explicitly perm_idx = np.arange(len(x) // 2) np.random.shuffle(perm_idx) actual_lowess = lowess(y, x, xvals=x[perm_idx], missing="none") assert_almost_equal(target[perm_idx, 1], actual_lowess, decimal=13) target_it0 = lowess(y, x, return_sorted=False, it=0) actual_lowess2 = lowess(y, x, xvals=x[perm_idx], it=0) assert_almost_equal(target_it0[perm_idx], actual_lowess2, decimal=13) # Check nans in exog_predict with pytest.raises(ValueError): lowess(y, x, xvals=np.array([np.nan, 5, 3]), missing="raise") # With is_sorted=True actual_lowess3 = lowess(y, x, xvals=x, is_sorted=True) assert_equal(actual_lowess3, target[:, 1]) # check with nans, this changes the arrays y[[5, 6]] = np.nan x[3] = np.nan target = lowess(y, x, is_sorted=True) # Test specifying exog_predict explicitly, now with nans perm_idx = np.arange(target.shape[0]) actual_lowess1 = lowess(y, x, xvals=target[perm_idx, 0]) assert_almost_equal(target[perm_idx, 1], actual_lowess1, decimal=13) # nans and missing='drop' actual_lowess2 = lowess(y, x, xvals=x, missing="drop") all_finite = np.isfinite(x) & np.isfinite(y) assert_equal(actual_lowess2[all_finite], target[:, 1]) # Dimensional check with pytest.raises(ValueError): lowess(y, x, xvals=np.array([[5], [10]])) def test_returns_inputs(): # see 1960 y = [0] * 10 + [1] * 10 x = np.arange(20) result = lowess(y, x, frac=0.4) assert_almost_equal(result, np.column_stack((x, y))) def test_xvals_dtype(reset_randomstate): y = [0] * 10 + [1] * 10 x = np.arange(20) # Previously raised ValueError: Buffer dtype mismatch results_xvals = lowess(y, x, frac=0.4, xvals=x[:5]) assert_allclose(results_xvals, np.zeros(5), atol=1e-12)