from collections import defaultdict
import glob
import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import KFold

if __name__ == "__main__":
    from black import FileMode, TargetVersion, format_file_contents
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_val_score

    PATH = os.environ.get("PSS_PATH", "..")
    print(f"Processing {PATH}")

    # Collect the simulated quantiles produced by pss.py, grouping the
    # series by (k, case, I1) parsed from the file names.  k == 0 (an ADF)
    # is skipped.
    files = glob.glob(os.path.join(PATH, "*.npz"))
    groups = defaultdict(list)
    for f in files:
        keys = f.split("-")
        key = int(keys[2]), int(keys[4]), keys[6] == "True"
        if key[0] == 0:
            continue
        with np.load(f) as contents:
            idx = (100 * contents["percentiles"]).astype(int)
            s = pd.Series(contents["q"], index=idx)
        groups[key].append(s)

    # Average across simulation files to produce point estimates of the
    # critical values at the quantiles of interest.
    final = {}
    quantiles = (90, 95, 99, 99.9)
    crit_vals = {}
    ordered_keys = sorted(groups.keys())
    for key in ordered_keys:
        final[key] = pd.concat(groups[key], axis=1)
        cv = []
        for q in quantiles:
            cv.append(final[key].loc[int(100 * q)].mean())
        crit_vals[key] = cv

    df = pd.DataFrame(crit_vals).T
    df.index.names = ("k", "case", "I1")
    df.columns = quantiles
    for key, row in df.iterrows():
        crit_vals[key] = [round(val, 7) for val in list(row)]

    def setup_regressors(df, low_pow=3, high_pow=3, cut=70, log=False):
        """Build the segmented polynomial design matrix and target.

        Rows at or below the percentile in position ``cut`` receive an
        intercept and powers 1..low_pow of the critical value; the
        remaining rows receive a separate intercept and powers
        1..high_pow.  The target is the normal quantile of the
        percentile, so fitted values map to p-values through the normal
        CDF.
        """
        s = df.stack().reset_index()
        q = s.level_0 / 10000
        y = stats.norm.ppf(q)
        cv = s[0]
        if log:
            cv = np.log(cv)
        m = np.where(s.level_0 <= df.index[cut])[0].max()
        reg = np.zeros((q.shape[0], 2 + low_pow + high_pow))
        reg[:m, 0] = 1
        for i in range(low_pow):
            reg[:m, i + 1] = cv[:m] ** (i + 1)
        w = 1 + low_pow
        reg[m:, w] = 1
        for i in range(high_pow):
            reg[m:, w + i + 1] = cv[m:] ** (i + 1)
        return reg, y

    # Select the lower-segment polynomial order, the crossover location
    # and whether to log the statistic by cross-validated MAE, then refit
    # the best configuration on the full sample.
    large_p = {}
    small_p = {}
    transform = {}
    max_stat = {}
    threshold = {}
    hp = 2
    for key in final:
        print(key)
        data = final[key]
        score = {}
        lr = LinearRegression(fit_intercept=False)
        for lp in (2, 3):
            for cut in range(40, data.shape[0] - 40):
                for log in (True, False):
                    cv = KFold(shuffle=True, random_state=20210903)
                    x, y = setup_regressors(data, lp, hp, cut, log)
                    k = (lp, hp, cut, log)
                    score[k] = cross_val_score(
                        lr, x, y, scoring="neg_mean_absolute_error", cv=cv
                    ).sum()
        idx = pd.Series(score).idxmax()
        lp, hp, cut, log = idx
        assert log
        x, y = setup_regressors(data, lp, hp, cut, log)
        lr = lr.fit(x, y)
        large = lr.coef_[: 1 + lp]
        if lp == 2:
            # Pad with a zero cubic coefficient so large_p always has
            # four entries.
            large = np.array(large.tolist() + [0.0])
        large_p[key] = large.tolist()
        small_p[key] = lr.coef_[1 + lp :].tolist()
        transform[key] = log
        max_stat[key] = np.inf
        threshold[key] = data.iloc[cut].mean()
        if small_p[key][2] < 0:
            # The upper quadratic turns over; record the statistic at its
            # vertex since fitted p-values are not monotone beyond it.
            max_stat[key] = small_p[key][1] / (-2 * small_p[key][2])

    for key in large_p:
        large_p[key] = [round(val, 5) for val in large_p[key]]
        small_p[key] = [round(val, 5) for val in small_p[key]]
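    # A minimal usage sketch, not part of the original pipeline: it shows
    # how the quantities computed above are intended to be combined.  The
    # helper name is hypothetical; the evaluation rule (log-transform,
    # polynomial, then normal CDF, switching at stat_star) follows the
    # module docstring below.  The max_stat cap is ignored for simplicity.
    def _asymptotic_pvalue(stat, key):
        """Approximate asymptotic p-value for a bounds-test statistic."""
        x = np.log(stat)
        if stat <= threshold[key]:
            # Statistics below the crossover use the order-3 polynomial.
            c = large_p[key]
        else:
            # Statistics above the crossover use the order-2 polynomial,
            # padded with a zero cubic term.
            c = small_p[key] + [0.0]
        y = c[0] + c[1] * x + c[2] * x**2 + c[3] * x**3
        return 1 - stats.norm.cdf(y)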
    raw_code = f"""
#!/usr/bin/env python
# coding: utf-8

\"\"\"
Critical value polynomials and related quantities for the bounds test of

Pesaran, M. H., Shin, Y., & Smith, R. J. (2001). Bounds testing approaches
to the analysis of level relationships. Journal of Applied Econometrics,
16(3), 289-326.

These were computed using 32,000,000 simulations for each key using the
methodology of PSS, who only used 40,000. The asymptotic p-value response
functions were computed based on the simulated values. Critical values
are the point estimates for the respective quantiles. The simulation code
is contained in pss.py, and its output files are transformed into this
module by pss-process.py.

The format of the keys is (k, case, I1) where

* k is the number of x variables included in the model (0 is an ADF)
* case is 1, 2, 3, 4 or 5 and corresponds to the cases in the PSS paper
* I1 is True if X contains I(1) variables and False if X is stationary

The parameters are for polynomials of order 3 (large) or 2 (small).
stat_star is the value where the switch between large and small occurs.
Stat values less than stat_star use large_p, while values above use
small_p. In all cases the stat is logged prior to computing the p-value,
so that the p-value is

1 - Phi(c[0] + c[1] * x + c[2] * x**2 + c[3] * x**3)

where x = np.log(stat) and Phi() is the standard normal CDF. In other
words, the polynomial is evaluated at the natural log of the test
statistic, and the normal CDF of that value is then computed to produce
the p-value.
\"\"\"

__all__ = ["large_p", "small_p", "crit_vals", "crit_percentiles", "stat_star"]

large_p = {large_p}
small_p = {small_p}
stat_star = {threshold}
crit_percentiles = {quantiles}
crit_vals = {crit_vals}
"""

    targets = {TargetVersion.PY37, TargetVersion.PY38, TargetVersion.PY39}
    fm = FileMode(target_versions=targets, line_length=79)
    formatted_code = format_file_contents(raw_code, fast=False, mode=fm)
    with open("../pss_critical_values.py", "w", newline="\n") as out:
        out.write(formatted_code)
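    # Illustrative check (an addition, not in the original script): the
    # response surface evaluated at each tabulated 95% critical value
    # should return a p-value close to 0.05.
    for key in ordered_keys:
        approx = _asymptotic_pvalue(crit_vals[key][1], key)
        print(f"{key}: p-value at the 95% critical value is {approx:.4f}")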