from statsmodels.compat.pandas import (
    is_float_index,
    is_int_index,
    is_numeric_dtype,
)

import numbers
import warnings

import numpy as np
from pandas import (
    DatetimeIndex,
    Index,
    Period,
    PeriodIndex,
    RangeIndex,
    Series,
    Timestamp,
    date_range,
    period_range,
    to_datetime,
)
from pandas.tseries.frequencies import to_offset

from statsmodels.base.data import PandasData
import statsmodels.base.model as base
import statsmodels.base.wrapper as wrap
from statsmodels.tools.sm_exceptions import ValueWarning

_tsa_doc = """
    %(model)s

    Parameters
    ----------
    %(params)s
    dates : array_like, optional
        An array-like object of datetime objects. If a pandas object is given
        for endog or exog, it is assumed to have a DateIndex.
    freq : str, optional
        The frequency of the time-series. A Pandas offset or 'B', 'D', 'W',
        'M', 'A', or 'Q'. This is optional if dates are given.
    %(extra_params)s
    %(extra_sections)s"""

_model_doc = "Timeseries model base class"

_generic_params = base._model_params_doc
_missing_param_doc = base._missing_param_doc


def get_index_loc(key, index):
    """
    Get the location of a specific key in an index

    Parameters
    ----------
    key : label
        The key for which to find the location if the underlying index is
        a DateIndex or a location if the underlying index is a RangeIndex
        or an Index with an integer dtype.
    index : pd.Index
        The index to search.

    Returns
    -------
    loc : int
        The location of the key
    index : pd.Index
        The index sliced through (and including) the location of `key`.
        It is built from the original index unless the index had to be
        expanded to accommodate `key`.
    index_was_expanded : bool
        Whether or not the index was expanded to accommodate `key`.

    Raises
    ------
    KeyError
        If `key` cannot be matched to a location. For RangeIndex and
        integral indexes, the IndexError / ValueError raised when indexing
        with an invalid location is converted to a KeyError.

    Notes
    -----
    If `key` is past the end of the given index, and the index is either
    an Index with an integral dtype or a date index, this function extends
    the index up to and including key, and then returns the location in the
    new index.
    """
    base_index = index

    date_index = isinstance(base_index, (PeriodIndex, DatetimeIndex))
    int_index = is_int_index(base_index)
    range_index = isinstance(base_index, RangeIndex)
    index_class = type(base_index)
    nobs = len(index)

    # Special handling for RangeIndex
    if range_index and isinstance(key, (int, np.integer)):
        # Negative indices (that lie in the Index)
        if key < 0 and -key <= nobs:
            key = nobs + key
        # Out-of-sample (note that we include key itself in the new index)
        elif key > nobs - 1:
            # See gh5835. Remove the except after pandas 0.25 required.
            try:
                base_index_start = base_index.start
                base_index_step = base_index.step
            except AttributeError:
                base_index_start = base_index._start
                base_index_step = base_index._step
            stop = base_index_start + (key + 1) * base_index_step
            index = RangeIndex(
                start=base_index_start, stop=stop, step=base_index_step
            )

    # Special handling for integral indices
    if (
        not range_index
        and int_index
        and not date_index
        and isinstance(key, (int, np.integer))
    ):
        # Negative indices (that lie in the Index)
        if key < 0 and -key <= nobs:
            key = nobs + key
        # Out-of-sample (note that we include key itself in the new index)
        elif key > base_index[-1]:
            index = Index(np.arange(base_index[0], int(key + 1)))

    # Special handling for date indexes
    if date_index:
        # Use index type to choose creation function
        if index_class is DatetimeIndex:
            index_fn = date_range
        else:
            index_fn = period_range
        # Integer key (i.e. already given a location)
        if isinstance(key, (int, np.integer)):
            # Negative indices (that lie in the Index); the bound is
            # inclusive, consistent with the RangeIndex and integral
            # branches above (key == -nobs refers to the first element)
            if key < 0 and -key <= nobs:
                key = index[nobs + key]
            # Out-of-sample (note that we include key itself in the new
            # index)
            elif key > len(base_index) - 1:
                index = index_fn(
                    start=base_index[0],
                    periods=int(key + 1),
                    freq=base_index.freq,
                )
                key = index[-1]
            else:
                key = index[key]
        # Other key types (i.e. string date or some datetime-like object)
        else:
            # Convert the key to the appropriate date-like object
            if index_class is PeriodIndex:
                date_key = Period(key, freq=base_index.freq)
            else:
                date_key = Timestamp(key)

            # Out-of-sample
            if date_key > base_index[-1]:
                # First create an index that may not always include `key`
                index = index_fn(
                    start=base_index[0], end=date_key, freq=base_index.freq
                )

                # Now make sure we include `key`
                if not index[-1] == date_key:
                    index = index_fn(
                        start=base_index[0],
                        periods=len(index) + 1,
                        freq=base_index.freq,
                    )

                # To avoid possible inconsistencies with `get_loc` below,
                # set the key directly equal to the last index location
                key = index[-1]

    # Get the location
    if date_index:
        # (note that get_loc will throw a KeyError if key is invalid)
        loc = index.get_loc(key)
    elif int_index or range_index:
        # For integral indices and RangeIndex, key is assumed to be the
        # location and not an index value (this assumption is required to
        # support RangeIndex)
        try:
            index[key]
        # We want to raise a KeyError in this case, to keep the exception
        # consistent across index types.
        # - Attempting to index with an out-of-bound location (e.g.
        #   index[10] on an index of length 9) will raise an IndexError
        #   (as of Pandas 0.22)
        # - Attempting to index with a type that cannot be cast to integer
        #   (e.g. a non-numeric string) will raise a ValueError if the
        #   index is RangeIndex (otherwise will raise an IndexError)
        #   (as of Pandas 0.22)
        except (IndexError, ValueError) as e:
            raise KeyError(str(e))
        loc = key
    else:
        loc = index.get_loc(key)

    # Check if we now have a modified index
    index_was_expanded = index is not base_index

    # Return the index through the end of the loc / slice
    if isinstance(loc, slice):
        end = loc.stop - 1
    else:
        end = loc

    return loc, index[: end + 1], index_was_expanded


def get_index_label_loc(key, index, row_labels):
    """
    Get the location of a specific key in an index or model row labels

    Parameters
    ----------
    key : label
        The key for which to find the location if the underlying index is
        a DateIndex or is only being used as row labels, or a location if
        the underlying index is a RangeIndex or an integral index.
    index : pd.Index
        The index to search.
    row_labels : pd.Index
        Row labels to search if key not found in index

    Returns
    -------
    loc : int
        The location of the key
    index : pd.Index
        The index sliced through (and including) the location of `key`.
        When the key was found via `row_labels`, this is the matching
        slice of `row_labels`.
    index_was_expanded : bool
        Whether or not the index was expanded to accommodate `key`.

    Raises
    ------
    KeyError
        If `key` is found neither in `index` nor in `row_labels`; the
        exception raised is the one from the initial search of `index`.

    Notes
    -----
    This function expands on `get_index_loc` by first trying the given
    base index and then falling back to try again with the model row
    labels as the base index. Integer keys are never looked up in the row
    labels, because integers always refer to locations.
    """
    try:
        loc, index, index_was_expanded = get_index_loc(key, index)
    except KeyError as e:
        try:
            # Integer keys are locations, so there is nothing to look up
            # in the row labels; re-raise the active KeyError.
            if isinstance(key, (int, np.integer)):
                raise
            loc = row_labels.get_loc(key)

            # Require scalar
            # Pandas may return a slice if there are multiple matching
            # locations that are monotonic increasing (otherwise it may
            # return an array of integer locations, see below).
            if isinstance(loc, slice):
                loc = loc.start
            if isinstance(loc, np.ndarray):
                # Pandas may return a mask (boolean array), for e.g.:
                # pd.Index(list('abcb')).get_loc('b')
                if loc.dtype == bool:
                    # Return the first True value
                    # (we know there is at least one True value if we're
                    # here because otherwise the get_loc call would have
                    # raised an exception)
                    loc = np.argmax(loc)
                # Finally, Pandas may return an integer array of
                # locations that match the given value, for e.g.
                # pd.DatetimeIndex(['2001-02', '2001-01']).get_loc('2001')
                # (this appears to be slightly undocumented behavior, since
                # only int, slice, and mask are mentioned in docs for
                # pandas.Index.get_loc as of 0.23.4)
                else:
                    loc = loc[0]
            if not isinstance(loc, numbers.Integral):
                raise

            index = row_labels[: loc + 1]
            index_was_expanded = False
        # Any failure of the fallback (including `row_labels` being None)
        # is reported as the original KeyError. `Exception` rather than a
        # bare `except` so that KeyboardInterrupt / SystemExit propagate
        # instead of being converted into a KeyError.
        except Exception:
            raise e
    return loc, index, index_was_expanded


def get_prediction_index(
    start,
    end,
    nobs,
    base_index,
    index=None,
    silent=False,
    index_none=False,
    index_generated=None,
    data=None,
):
    """
    Get the prediction start / end locations and the associated index

    Parameters
    ----------
    start : label
        The key at which to start prediction. Depending on the underlying
        model's index, may be an integer, a date (string, datetime object,
        pd.Timestamp, or pd.Period object), or some other object in the
        model's row labels.
    end : label
        The key at which to end prediction (note that this key will be
        *included* in prediction). Depending on the underlying model's
        index, may be an integer, a date (string, datetime object,
        pd.Timestamp, or pd.Period object), or some other object in the
        model's row labels. If None, the later of `start` and the last
        location of `base_index` is used.
    nobs : int
        The number of in-sample observations; locations past `nobs - 1`
        are counted as out-of-sample.
    base_index : pd.Index
        The index in which `start` and `end` are looked up first.
    index : pd.Index, optional
        Optionally an index to associate the predicted results to. If None,
        an attempt is made to create an index for the predicted results
        from the model's index or model's row labels.
    silent : bool, optional
        Argument to silence warnings.
    index_none : bool, optional
        If True, and neither `index` is given nor the generated-index
        branch applies, the returned prediction index is None.
    index_generated : bool, optional
        Whether `base_index` is a generated index standing in for an
        unsupported one. If True (and `index_none` is False), in-sample
        predictions use `data.row_labels`; otherwise warnings are issued.
    data : object
        Object exposing `row_labels`; its `predict_start`, `predict_end`
        and `predict_dates` attributes are set by this function. Although
        the default is None, it is accessed unconditionally and so must be
        provided.

    Returns
    -------
    start : int
        The index / observation location at which to begin prediction.
    end : int
        The index / observation location at which to end in-sample
        prediction. The maximum value for this is nobs-1.
    out_of_sample : int
        The number of observations to forecast after the end of the sample.
    prediction_index : pd.Index or None
        The index associated with the prediction results. This index covers
        the range [start, end + out_of_sample]. If the model has no given
        index and no given row labels (i.e. endog/exog is not Pandas), then
        this will be None.

    Raises
    ------
    KeyError
        If `start` or `end` cannot be matched to a location.
    ValueError
        If `end` lies before `start`, or if the given `index` does not have
        a length consistent with `start` and `end`.

    Notes
    -----
    The arguments `start` and `end` behave differently, depending on if
    they are integer or not. If either is an integer, then it is assumed
    to refer to a *location* in the index, not to an index value. On the
    other hand, if it is a date string or some other type of object, then
    it is assumed to refer to an index *value*. In all cases, the returned
    `start` and `end` values refer to index *locations* (so in the former
    case, the given location is validated and returned whereas in the
    latter case a location is found that corresponds to the given index
    value).

    This difference in behavior is necessary to support `RangeIndex`. This
    is because integers for a RangeIndex could refer either to index values
    or to index locations in an ambiguous way (while for integral indices,
    since we have required them to be full indexes, there is no ambiguity).
    """
    # Convert index keys (start, end) to index locations and get associated
    # indexes.
    try:
        start, _, start_oos = get_index_label_loc(
            start, base_index, data.row_labels
        )
    except KeyError:
        raise KeyError(
            "The `start` argument could not be matched to a"
            " location related to the index of the data."
        )
    if end is None:
        end = max(start, len(base_index) - 1)
    try:
        end, end_index, end_oos = get_index_label_loc(
            end, base_index, data.row_labels
        )
    except KeyError:
        raise KeyError(
            "The `end` argument could not be matched to a"
            " location related to the index of the data."
        )

    # Handle slices (if the given index keys cover more than one date)
    if isinstance(start, slice):
        start = start.start
    if isinstance(end, slice):
        end = end.stop - 1

    # Get the actual index for the prediction
    prediction_index = end_index[start:]

    # Validate prediction options
    if end < start:
        raise ValueError("Prediction must have `end` after `start`.")

    # Handle custom prediction index
    # First, if we were given an index, check that it's the right size and
    # use it if so
    if index is not None:
        if not len(prediction_index) == len(index):
            raise ValueError(
                "Invalid `index` provided in prediction."
                " Must have length consistent with `start`"
                " and `end` arguments."
            )
        # But if we weren't given Pandas input, this index will not be
        # used because the data will not be wrapped; in that case, issue
        # a warning
        if not isinstance(data, PandasData) and not silent:
            warnings.warn(
                "Because the model data (`endog`, `exog`) were"
                " not given as Pandas objects, the prediction"
                " output will be Numpy arrays, and the given"
                " `index` argument will only be used"
                " internally.",
                ValueWarning,
                stacklevel=2,
            )
        prediction_index = Index(index)
    # Now, if we *do not* have a supported index, but we were given some
    # kind of index...
    elif index_generated and not index_none:
        # If we are in sample, and have row labels, use them
        if data.row_labels is not None and not (start_oos or end_oos):
            prediction_index = data.row_labels[start : end + 1]
        # Otherwise, warn the user that they will get an integral index
        else:
            if not silent:
                warnings.warn(
                    "No supported index is available."
                    " Prediction results will be given with"
                    " an integer index beginning at `start`.",
                    ValueWarning,
                    stacklevel=2,
                )
            # Note: this second warning is issued regardless of `silent`
            warnings.warn(
                "No supported index is available. In the next"
                " version, calling this method in a model"
                " without a supported index will result in an"
                " exception.",
                DeprecationWarning,
                stacklevel=2,
            )
    elif index_none:
        prediction_index = None

    # For backwards compatibility, set `predict_*` values
    if prediction_index is not None:
        data.predict_start = prediction_index[0]
        data.predict_end = prediction_index[-1]
        data.predict_dates = prediction_index
    else:
        data.predict_start = None
        data.predict_end = None
        data.predict_dates = None

    # Compute out-of-sample observations
    out_of_sample = max(end - (nobs - 1), 0)
    end -= out_of_sample

    return start, end, out_of_sample, prediction_index


class TimeSeriesModel(base.LikelihoodModel):
    __doc__ = _tsa_doc % {
        "model": _model_doc,
        "params": _generic_params,
        "extra_params": _missing_param_doc,
        "extra_sections": "",
    }

    def __init__(
        self, endog, exog=None, dates=None, freq=None, missing="none", **kwargs
    ):
        super().__init__(endog, exog, missing=missing, **kwargs)

        # Date handling in indexes
        self._init_dates(dates, freq)

    def _init_dates(self, dates=None, freq=None):
        """
        Initialize dates

        Parameters
        ----------
        dates : array_like, optional
            An array like object containing dates.
        freq : str, tuple, datetime.timedelta, DateOffset or None, optional
            A frequency specification for either `dates` or the row labels
            from the endog / exog data.

        Raises
        ------
        ValueError
            If `freq` is given without any index, if `dates` cannot be
            coerced to a date index, if `dates` has no frequency and none
            can be inferred, if `freq` is incompatible with the index, or
            if `freq` is given but the index could not be coerced to dates.

        Notes
        -----
        Creates `self._index` and related attributes. `self._index` is
        always a Pandas index, and it is always an integral index,
        DatetimeIndex, or PeriodIndex.

        If Pandas objects, endog / exog may have any type of index. If it
        is an integral index with values 0, 1, ..., nobs-1 or if it is
        (coerceable to) a DatetimeIndex or PeriodIndex *with an associated
        frequency*, then it is called a "supported" index. Otherwise it is
        called an "unsupported" index.

        Supported indexes are standardized (i.e. a list of date strings is
        converted to a DatetimeIndex) and the result is put in
        `self._index`.

        Unsupported indexes are ignored, and a supported integral index is
        generated and put in `self._index`. Warnings are issued in this
        case to alert the user if the returned index from some operation
        (e.g. forecasting) is different from the original data's index.
        However, whenever possible (e.g. purely in-sample prediction), the
        original index is returned.

        The benefit of supported indexes is that they allow *forecasting*,
        i.e. it is possible to extend them in a reasonable way. Thus every
        model must have an underlying supported index, even if it is just
        a generated integral index.
        """

        # Get our index from `dates` if available, otherwise from whatever
        # Pandas index we might have retrieved from endog, exog
        if dates is not None:
            index = dates
        else:
            index = self.data.row_labels

        # Sanity check that we do not have a `freq` without an index
        if index is None and freq is not None:
            raise ValueError("Frequency provided without associated index.")

        # If an index is available, see if it is a date-based index or if it
        # can be coerced to one. (If it cannot we'll fall back, below, to an
        # internal, 0, 1, ... nobs-1 integer index for modeling purposes)
        inferred_freq = False
        if index is not None:
            # Try to coerce to date-based index
            if not isinstance(index, (DatetimeIndex, PeriodIndex)):
                try:
                    # Only try to coerce non-numeric index types (string,
                    # list of date-times, etc.)
                    # Note that np.asarray(Float64Index([...])) yields an
                    # object dtype array in earlier versions of Pandas (and
                    # so will not have is_numeric_dtype == True), so
                    # explicitly check for it here. But note also that in
                    # very early Pandas (~0.12), Float64Index does not exist
                    # (and so the statsmodels compat makes it an empty
                    # tuple), so in that case also check if the first
                    # element is a float.
                    _index = np.asarray(index)
                    if (
                        is_numeric_dtype(_index)
                        or is_float_index(index)
                        or (isinstance(_index[0], float))
                    ):
                        raise ValueError("Numeric index given")
                    # If a non-index Pandas series was given, only keep its
                    # values (because we must have a pd.Index type, below,
                    # and pd.to_datetime will return a Series when passed
                    # non-list-like objects)
                    if isinstance(index, Series):
                        index = index.values
                    # All coercion is done via pd.to_datetime
                    # Note: date coercion via pd.to_datetime does not handle
                    # string versions of PeriodIndex objects most of the
                    # time.
                    _index = to_datetime(index)
                    # Older versions of Pandas can sometimes fail here and
                    # return a numpy array - check to make sure it's an
                    # index
                    if not isinstance(_index, Index):
                        raise ValueError("Could not coerce to date index")
                    index = _index
                # `Exception` rather than a bare `except`, so that
                # KeyboardInterrupt / SystemExit are not swallowed here.
                except Exception:
                    # Only want to actually raise an exception if `dates`
                    # was provided but cannot be coerced. If we got the
                    # index from the row_labels, we'll just ignore it and
                    # use the integer index below
                    if dates is not None:
                        raise ValueError(
                            "Non-date index index provided to"
                            " `dates` argument."
                        )
            # Now, if we were given, or coerced, a date-based index, make
            # sure it has an associated frequency
            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                # If no frequency, try to get an inferred frequency
                if freq is None and index.freq is None:
                    freq = index.inferred_freq
                    # If we got an inferred frequency, alert the user
                    if freq is not None:
                        inferred_freq = True
                        warnings.warn(
                            "No frequency information was"
                            " provided, so inferred frequency %s"
                            " will be used." % freq,
                            ValueWarning,
                            stacklevel=2,
                        )

                # Convert the passed freq to a pandas offset object
                if freq is not None:
                    freq = to_offset(freq)

                # Now, if no frequency information is available from the
                # index itself or from the `freq` argument, raise an
                # exception
                if freq is None and index.freq is None:
                    # But again, only want to raise the exception if `dates`
                    # was provided.
                    if dates is not None:
                        raise ValueError(
                            "No frequency information was"
                            " provided with date index and no"
                            " frequency could be inferred."
                        )
                # However, if the index itself has no frequency information
                # but the `freq` argument is available (or was inferred),
                # construct a new index with an associated frequency
                elif freq is not None and index.freq is None:
                    resampled_index = date_range(
                        start=index[0], end=index[-1], freq=freq
                    )
                    if not inferred_freq and not resampled_index.equals(index):
                        raise ValueError(
                            "The given frequency argument could"
                            " not be matched to the given index."
                        )
                    index = resampled_index
                # Finally, if the index itself has a frequency and there was
                # also a given frequency, raise an exception if they are not
                # equal
                elif (
                    freq is not None
                    and not inferred_freq
                    and not (index.freq == freq)
                ):
                    raise ValueError(
                        "The given frequency argument is"
                        " incompatible with the given index."
                    )
            # Finally, raise an exception if we could not coerce to
            # date-based but we were given a frequency argument
            elif freq is not None:
                raise ValueError(
                    "Given index could not be coerced to dates"
                    " but `freq` argument was provided."
                )

        # Get attributes of the index
        has_index = index is not None
        date_index = isinstance(index, (DatetimeIndex, PeriodIndex))
        int_index = is_int_index(index)
        range_index = isinstance(index, RangeIndex)
        has_freq = index.freq is not None if date_index else None
        increment = Index(range(self.endog.shape[0]))
        is_increment = index.equals(increment) if int_index else None
        if date_index:
            try:
                is_monotonic = index.is_monotonic_increasing
            except AttributeError:
                # Remove after pandas 1.5 is minimum
                is_monotonic = index.is_monotonic
        else:
            is_monotonic = None

        # Issue warnings for unsupported indexes
        if has_index and not (date_index or range_index or is_increment):
            warnings.warn(
                "An unsupported index was provided and will be"
                " ignored when e.g. forecasting.",
                ValueWarning,
                stacklevel=2,
            )
        if date_index and not has_freq:
            warnings.warn(
                "A date index has been provided, but it has no"
                " associated frequency information and so will be"
                " ignored when e.g. forecasting.",
                ValueWarning,
                stacklevel=2,
            )
        if date_index and not is_monotonic:
            warnings.warn(
                "A date index has been provided, but it is not"
                " monotonic and so will be ignored when e.g."
                " forecasting.",
                ValueWarning,
                stacklevel=2,
            )

        # Construct the internal index
        index_generated = False
        valid_index = (
            (date_index and has_freq and is_monotonic)
            or (int_index and is_increment)
            or range_index
        )

        if valid_index:
            _index = index
        else:
            _index = increment
            index_generated = True
        self._index = _index
        self._index_generated = index_generated
        self._index_none = index is None
        self._index_int64 = int_index and not range_index and not date_index
        self._index_dates = date_index and not index_generated
        self._index_freq = self._index.freq if self._index_dates else None
        self._index_inferred_freq = inferred_freq

        # For backwards compatibility, set data.dates, data.freq
        self.data.dates = self._index if self._index_dates else None
        self.data.freq = self._index.freqstr if self._index_dates else None

    def _get_index_loc(self, key, base_index=None):
        """
        Get the location of a specific key in an index

        Parameters
        ----------
        key : label
            The key for which to find the location if the underlying index
            is a DateIndex or a location if the underlying index is a
            RangeIndex or an integral index.
        base_index : pd.Index, optional
            Optionally the base index to search. If None, the model's index
            is searched.

        Returns
        -------
        loc : int
            The location of the key
        index : pd.Index
            The index sliced through (and including) the location of
            `key`. It is built from the original index unless the index
            had to be expanded to accommodate `key`.
        index_was_expanded : bool
            Whether or not the index was expanded to accommodate `key`.

        Notes
        -----
        If `key` is past the end of the given index, and the index is
        either an integral index or a date index, this function extends the
        index up to and including key, and then returns the location in the
        new index.
        """

        if base_index is None:
            base_index = self._index
        return get_index_loc(key, base_index)

    def _get_index_label_loc(self, key, base_index=None):
        """
        Get the location of a specific key in an index or model row labels

        Parameters
        ----------
        key : label
            The key for which to find the location if the underlying index
            is a DateIndex or is only being used as row labels, or a
            location if the underlying index is a RangeIndex or an integral
            index.
        base_index : pd.Index, optional
            Optionally the base index to search. If None, the model's index
            is searched.

        Returns
        -------
        loc : int
            The location of the key
        index : pd.Index
            The index sliced through (and including) the location of
            `key`; a slice of the model row labels when the key was found
            there instead.
        index_was_expanded : bool
            Whether or not the index was expanded to accommodate `key`.

        Notes
        -----
        This method expands on `_get_index_loc` by first trying the given
        base index (or the model's index if the base index was not given)
        and then falling back to try again with the model row labels as the
        base index.
        """
        if base_index is None:
            base_index = self._index
        return get_index_label_loc(key, base_index, self.data.row_labels)

    def _get_prediction_index(self, start, end, index=None, silent=False):
        """
        Get the prediction start / end locations and the associated index

        Parameters
        ----------
        start : label
            The key at which to start prediction. Depending on the
            underlying model's index, may be an integer, a date (string,
            datetime object, pd.Timestamp, or pd.Period object), or some
            other object in the model's row labels.
        end : label
            The key at which to end prediction (note that this key will be
            *included* in prediction). Depending on the underlying model's
            index, may be an integer, a date (string, datetime object,
            pd.Timestamp, or pd.Period object), or some other object in the
            model's row labels.
        index : pd.Index, optional
            Optionally an index to associate the predicted results to. If
            None, an attempt is made to create an index for the predicted
            results from the model's index or model's row labels.
        silent : bool, optional
            Argument to silence warnings.

        Returns
        -------
        start : int
            The index / observation location at which to begin prediction.
        end : int
            The index / observation location at which to end in-sample
            prediction. The maximum value for this is nobs-1.
        out_of_sample : int
            The number of observations to forecast after the end of the
            sample.
        prediction_index : pd.Index or None
            The index associated with the prediction results. This index
            covers the range [start, end + out_of_sample]. If the model has
            no given index and no given row labels (i.e. endog/exog is not
            Pandas), then this will be None.

        Notes
        -----
        The arguments `start` and `end` behave differently, depending on if
        they are integer or not. If either is an integer, then it is
        assumed to refer to a *location* in the index, not to an index
        value. On the other hand, if it is a date string or some other type
        of object, then it is assumed to refer to an index *value*. In all
        cases, the returned `start` and `end` values refer to index
        *locations* (so in the former case, the given location is validated
        and returned whereas in the latter case a location is found that
        corresponds to the given index value).

        This difference in behavior is necessary to support `RangeIndex`.
        This is because integers for a RangeIndex could refer either to
        index values or to index locations in an ambiguous way (while for
        integral indices, since we have required them to be full indexes,
        there is no ambiguity).
        """
        nobs = len(self.endog)
        return get_prediction_index(
            start,
            end,
            nobs,
            base_index=self._index,
            index=index,
            silent=silent,
            index_none=self._index_none,
            index_generated=self._index_generated,
            data=self.data,
        )

    def _get_exog_names(self):
        return self.data.xnames

    def _set_exog_names(self, vals):
        # A single name is wrapped so that `xnames` is always a list
        if not isinstance(vals, list):
            vals = [vals]
        self.data.xnames = vals

    # overwrite with writable property for (V)AR models
    exog_names = property(
        _get_exog_names,
        _set_exog_names,
        None,
        "The names of the exogenous variables.",
    )


class TimeSeriesModelResults(base.LikelihoodModelResults):
    def __init__(self, model, params, normalized_cov_params, scale=1.0):
        self.data = model.data
        super().__init__(model, params, normalized_cov_params, scale)


class TimeSeriesResultsWrapper(wrap.ResultsWrapper):
    _attrs = {}
    _wrap_attrs = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_attrs, _attrs
    )
    _methods = {"predict": "dates"}
    _wrap_methods = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_methods, _methods
    )


wrap.populate_wrapper(
    TimeSeriesResultsWrapper, TimeSeriesModelResults  # noqa:E305
)