"""
Test Results for ordinal models from R MASS lib
"""

import numpy as np
import os
import pandas as pd
from statsmodels.tools.testing import Holder

# R (v3.4.4) code inspired from
# https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/
# library(readr) # to open the file
# library(MASS) # to perform ordinal regression
#
# ## load the data, 400 rows with 3 exogs(2 binaries, 1 float)
# ##and target 3-ordinal variable
# ologit_ucla <- read_csv("ologit_ucla.csv")
# ologit_ucla$apply <- as.factor(ologit_ucla$apply)
# ologit_ucla$apply <- factor(ologit_ucla$apply,
#               levels=c("unlikely", "somewhat likely", "very likely"))
#
# ## fit ordered logit model
# r_logit <- polr(apply ~ pared + public + gpa,
#          data = ologit_ucla,
#          method = 'logit', # or 'probit'
#          Hess=TRUE)
#
# ## fit ordered probit model
# r_probit <- polr(apply ~ pared + public + gpa,
#      data = ologit_ucla,
#      method = 'probit',
#      Hess=TRUE)
#
# ## fit ordered cloglog model
# r_cloglog <- polr(apply ~ pared + public + gpa,
#          data = ologit_ucla,
#          method = 'cloglog',
#          Hess=TRUE)
#
# ## with r = r_logit or r_probit or r_cloglog
# ## we add p-values
# (ctable <- coef(summary(r)))
# p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
# (ctable <- cbind(ctable, "p value" = p))
# ## show 7 first predictions
# head(predict(r, subset(ologit_ucla,
#                        select=c("pared", "public","gpa")), type='prob'),7)

data_store = Holder()
cur_dir = os.path.dirname(os.path.abspath(__file__))
df = pd.read_csv(os.path.join(cur_dir, "ologit_ucla.csv"))

# df_unordered['apply'] is pd.Categorical with ordered = False
df_unordered = df.copy()
df_unordered['apply'] = pd.Categorical(df['apply'], ordered=False)
# but categories are set in order
df_unordered['apply'] = df_unordered['apply'].cat.set_categories(
    ['unlikely', 'somewhat likely', 'very likely'])

# df['apply'] is pd.Categorical with ordered = True
df['apply'] = pd.Categorical(df['apply'], ordered=True)
df['apply'] = df['apply'].cat.set_categories(
    ['unlikely', 'somewhat likely', 'very likely'])

data_store.df_unordered = df_unordered
data_store.df = df
data_store.nobs = 400
data_store.n_ordinal_cat = 3

res_ord_logit = Holder()
res_ord_logit.coefficients_val = \
    np.array([1.04769011, -0.05878572, 0.61594057])
res_ord_logit.coefficients_stdE = np.array([0.2658, 0.2979, 0.2606])
res_ord_logit.coefficients_tval = np.array([3.9418, -0.1974, 2.3632])
res_ord_logit.coefficients_pval = \
    np.array([8.087070e-05, 8.435464e-01, 1.811594e-02])
res_ord_logit.thresholds = np.array([2.203915, 4.299363])
res_ord_logit.prob_pred = np.array([[0.5488310, 0.3593310, 0.09183798],
                                    [0.3055632, 0.4759496, 0.21848725],
                                    [0.2293835, 0.4781951, 0.29242138],
                                    [0.6161224, 0.3126888, 0.07118879],
                                    [0.6560149, 0.2833901, 0.06059505],
                                    [0.6609240, 0.2797117, 0.05936431],
                                    [0.6518332, 0.2865114, 0.06165547]])
res_ord_logit.resid_prob = np.array(
    [+0.90816202,  0.08707593, -0.77061649,  0.54493358,  0.59541984,
     -0.33907603,  0.59017771,  0.55970937, -0.41190566,  0.01851403,
     -0.73753054,  0.53932241,  0.87609730,  0.56880356, -0.73936739,
     -0.42539653, -0.47329831, -0.29581150,  0.90792753,  0.42811409])
res_ord_logit.resid_prob_stats = [
    3.5137670974297e-06, -0.7706164931682951, 0.9434781714439548,
    0.2550630116905416]

res_ord_probit = Holder()
res_ord_probit.coefficients_val = np.array([0.59811, 0.01016, 0.35815])
res_ord_probit.coefficients_stdE = np.array([0.1579, 0.1728, 0.1568])
res_ord_probit.coefficients_tval = np.array([3.78881, 0.05878, 2.28479])
res_ord_probit.coefficients_pval = \
    np.array([1.513681e-04, 9.531256e-01, 2.232519e-02])
res_ord_probit.thresholds = np.array([1.2968, 2.5028])
res_ord_probit.prob_pred = np.array([[0.5514181, 0.3576848, 0.09089707],
                                     [0.3260107, 0.4488799, 0.22510933],
                                     [0.2349733, 0.4506351, 0.31439162],
                                     [0.6142501, 0.3184778, 0.06727214],
                                     [0.6519891, 0.2928449, 0.05516602],
                                     [0.6402204, 0.3009945, 0.05878509],
                                     [0.6480094, 0.2956162, 0.05637442]])

res_ord_cloglog = Holder()
res_ord_cloglog.coefficients_val = np.array([0.5166455, 0.1081131, 0.3343895])
res_ord_cloglog.coefficients_stdE = np.array([0.1613525, 0.1680675, 0.1542065])
res_ord_cloglog.coefficients_tval = np.array([3.2019668, 0.6432721, 2.1684534])
res_ord_cloglog.coefficients_pval = \
    np.array([1.364927e-03, 5.200475e-01, 3.012421e-02])
res_ord_cloglog.thresholds = np.array([0.8705304, 1.9744660])
res_ord_cloglog.prob_pred = np.array([[0.5519526, 0.3592524, 0.08879500],
                                      [0.3855287, 0.3842645, 0.23020682],
                                      [0.2899487, 0.3540202, 0.35603111],
                                      [0.6067184, 0.3333548, 0.05992678],
                                      [0.6411418, 0.3133969, 0.04546127],
                                      [0.5940557, 0.3400072, 0.06593710],
                                      [0.6374521, 0.3156622, 0.04688570]])