%_ **************************************************************************
%_ * The TeX source for AMS journal articles is the publisher's TeX code    *
%_ * which may contain special commands defined for the AMS production      *
%_ * environment.  Therefore, it may not be possible to process these files *
%_ * through TeX without errors.  To display a typeset version of a journal *
%_ * article easily, we suggest that you either view the HTML version or    *
%_ * retrieve the article in DVI, PostScript, or PDF format.                *
%_ **************************************************************************
% Author Package file for use with AMS-LaTeX 1.2
\controldates{7-OCT-1999,7-OCT-1999,7-OCT-1999,7-OCT-1999}
 
\documentclass{era-l}

\newcommand{\cT}{\mathcal{T}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\hP}{{\hat{P}}}
\newcommand{\ml}{\mbox{\scriptsize ML}}
\newcommand{\mdl}{\mbox{\scriptsize MDL}}
\newcommand{\kt}{\mbox{\scriptsize KT}}
\newcommand{\KT}{\mbox{KT}}
\newcommand{\hS}{\hat{\mathcal{S}}}
\newcommand{\bic}{\mbox{\scriptsize BIC}}
\newcommand{\goesto}{\rightarrow}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{thmbic}{The BIC consistency theorem}
\newtheorem*{thmtyp}{The typicality theorem}
\newtheorem*{thmin}{The inconsistency theorem}

\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{example}[theorem]{Example}
\newtheorem{xca}[theorem]{Exercise}

\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\numberwithin{equation}{section}

\begin{document}

\issueinfo{5}{17}{}{1999}
\dateposted{October 19, 1999}
\pagespan{123}{127}
\PII{S 1079-6762(99)00070-0}
\def\copyrightyear{1999}
\copyrightinfo{1999}{American Mathematical Society}

\title{Consistency of the BIC order estimator}
\author{Imre Csisz\'ar}
\address{A. R\'enyi Institute of Mathematics, Hungarian 
Academy of Sciences, POB 127, 1364 Budapest, Hungary}
\email{csiszar@math-inst.hu}
\thanks{First author supported in part by a joint
NSF-Hungarian Academy grant 92}

\author{Paul C. Shields}
\address{Mathematics
Department, The University of Toledo, Toledo, OH 43606}
\email{paul.shields@utoledo.edu}
\thanks{Second author supported in part by a joint
NSF-Hungarian Academy grant INT-9515485}

\commby{Yitzhak Katznelson}
\subjclass{Primary 62F12, 62M05; Secondary 62F13, 60J10}

\date{February 25, 1999}

\keywords{Bayesian information criterion, order estimation, ratio-typicality, 
Markov chains}

\begin{abstract}
We announce two results on the problem
of estimating the order of a Markov chain from
observation of a sample path.   First is  that the
Bayesian Information Criterion (BIC) leads to an almost
surely consistent estimator. Second is that the
Bayesian minimum description length estimator, of
which the BIC estimator is an approximation, fails to
be  consistent for the  uniformly distributed i.i.d.\
process.  A key tool is a
strong  ratio-typicality  result for empirical
$k$-block distributions. Complete proofs are given in
the authors' article to appear in The Annals of 
Statistics. 
\end{abstract}

\maketitle

\section{Introduction} 
Let  $\cM_k$ denote the class of
Markov chains  of order at most
$k$, with values drawn from a
finite set $A$, and let
$\cM=\bigcup_{k=0}^{\infty}\cM_k$.
  An important problem is to estimate the order of a
Markov chain from observation of a finite sample
path.   A popular method is
 the so-called Bayesian Information
Criterion (BIC), first introduced  by
Schwarz,
 \cite{Schwarz}, which gives the estimator
defined by
\begin{equation}
 \hat{k}_{\bic}=\hat{k}_{\bic}(x_1^n)
=\arg\min_k\biggl(-\log
P_{\ml(k)}(x_1^n)+
     \frac{|A|^k(|A|-1)}{2}\log n\biggr),
 \label{BIC-estimator}    \end{equation}
where $P_{\ml(k)}(x_1^n)$ is the $k$-th order  maximum
likelihood, i.e., the largest probability given to
$x_1^n$ by processes in
$\cM_k$.   Our first result is the following.

\begin{thmbic} 
For any stationary, irreducible process in $\cM$,
$\hat{k}_{\bic}(X_1^n)$ is eventually almost
surely equal to the order of the process.
\end{thmbic}
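
For concreteness, the following minimal sketch (in Python; the
function names, and the convention that the maximization over
$\cM_k$ assigns the initial $k$ symbols probability one, are
illustrative assumptions rather than conventions taken from the
article) computes the estimator (\ref{BIC-estimator}) from a
sample path:
\begin{verbatim}
from collections import Counter
from math import log

def block_counts(x, k):
    # N(a_1^k | x_1^n): occurrence counts of all k-blocks in x
    return Counter(tuple(x[i:i + k]) for i in range(len(x) - k + 1))

def max_log_likelihood(x, k):
    # log P_ML(k): maximized order-k Markov likelihood of x_1^n,
    # giving the initial k symbols probability one
    trans = block_counts(x, k + 1)     # (k+1)-block counts in x_1^n
    ctx = block_counts(x[:-1], k)      # k-block counts in x_1^{n-1}
    return sum(c * log(c / ctx[b[:k]]) for b, c in trans.items())

def bic_order(x, A, k_max):
    # minimize -log P_ML(k) + (|A|^k (|A|-1)/2) log n over k
    n, a = len(x), len(A)
    return min(range(k_max + 1),
               key=lambda k: -max_log_likelihood(x, k)
                             + a ** k * (a - 1) / 2 * log(n))
\end{verbatim}
The cutoff \texttt{k\_max} merely keeps the sketch finite; the
theorem concerns the minimization over all $k$.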


Our theorem does  not assume  a finite
number of model classes. Claims of
 BIC consistency have been made by several
authors; see, for example,
\cite{Finesso,Barron-Rissanen-Yu} as well as
several papers listed in
\cite{Barron-Rissanen-Yu}.  All these papers
include the explicit or hidden assumption
that there are only a finite number of model
classes.   In this case, however, as noted in
\cite{Haughton,Finesso},  $\log n$ can be
replaced in the definition (\ref{BIC-estimator}) by
something much smaller, e.g., a suitable
multiple of $\log\log n$.
We  also point out that Kieffer
established consistency without an a priori
bound on the order for the case when
$|A|^k(|A|-1)$ is  replaced in (\ref{BIC-estimator})
by a more rapidly growing function of $k$,
\cite{Kieffer}.

An essential tool in proving the consistency
of the BIC order estimator is a strong
ratio-typicality result we establish for
stationary,  irreducible Markov
chains, a result that appears to be of independent
interest.  It says, loosely, that as long as $k$ does
not grow too rapidly with sample path length, the
ratio of the empirical relative frequency of
each $k$-block to its probability is uniformly
close to 1. The
 empirical distribution of
$k$-blocks in $x_1^n$,  or the {\em
$k$-type of $x_1^n$}, is defined by the formula
 \begin{equation}
  \hP_n(a_1^{k}) =
  \frac{1} {n-k+1}N(a_1^k|x_1^n), \;
a_1^{k}\in A^{k},
  \label{type-def}    \end{equation}
where $N(a_1^k|x_1^n)=|\{i\in [1,n-k+1]
\colon\ x_i^{i+k-1}=a_1^{k}\}|$ is the number of
occurrences of $a_1^k$ in $x_1^n$.

 The sequence
$x_1^n$ is called {\em
$(k,\epsilon)$-typical} for a process $Q$ if
$\hP_n(a_1^k)=0$, whenever $Q(a_1^k)=0$, and
 \begin{equation}
 \biggl|\frac{\hP_n(a_1^{k})}
 {Q(a_1^{k})}-1
\biggr| <\epsilon, \mbox{ whenever }
Q(a_1^{k})>0.
  \label{typical-def}    \end{equation}

\begin{thmtyp} For any stationary irreducible process
$Q\in \cM$, there exist positive numbers
$\alpha$ and
$\beta$ such that eventually almost surely
as $n\goesto
\infty$, the sequence
$x_1^n$ is
$(k,n^{-\beta})$-typical for every $ k \leq
\alpha\log n.$
\end{thmtyp}
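
In computational terms, the $k$-type (\ref{type-def}) and the
typicality test (\ref{typical-def}) are straightforward; the
following sketch (in Python, reusing \texttt{block\_counts} from
the sketch above, with $Q$ an illustrative dictionary of $k$-block
probabilities) makes them concrete:
\begin{verbatim}
def k_type(x, k):
    # empirical distribution of k-blocks in x_1^n
    counts = block_counts(x, k)
    total = len(x) - k + 1
    return {a: c / total for a, c in counts.items()}

def is_typical(x, k, eps, Q):
    # (k, eps)-typicality: the empirical support lies inside the
    # support of Q, and every ratio \hat P_n / Q is within eps of 1
    P = k_type(x, k)
    if any(Q.get(a, 0) == 0 for a in P):
        return False
    return all(abs(P.get(a, 0.0) / q - 1) < eps
               for a, q in Q.items() if q > 0)
\end{verbatim}
The typicality theorem then says that, almost surely, for all
large $n$ the test passes with $\epsilon=n^{-\beta}$
simultaneously for every $k\leq\alpha\log n$.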
 
Earlier typicality results  in which
block length grows with sample path length include
the following.
   \begin{enumerate}
 \item Marton and Shields have obtained large deviations
bounds for the variational distance between the
empirical $k$-block distribution and the theoretical
$k$-block distribution that are valid for all
$\displaystyle k\leq \frac{\log n}{H+\epsilon}$, where
$H$ is the process entropy,
\cite{Marton-Shields}.  Their results do yield
our typicality theorem for $k=o(\log\log n)$, but
this is not sufficient for our proof of the BIC
consistency theorem.
 \item Flajolet, Kirschenhofer, and Tichy
have obtained similar results for the case
when $Q$ is the unbiased coin-tossing
process,
\cite{Flajolet-etal}.  They consider  longer
blocks but do not give an error rate.
   \end{enumerate}

The Bayesian framework that
underlies the BIC
starts  with a prior distribution
$\{p_k\}$ on the possible orders, together
with a prior distribution on
$\cM_k$, for each $k$; the latter defines a
mixture distribution, that is, a weighted
average of the processes in $\cM_k$.  The Bayesian
order estimator chooses that $k$ for which the product
of $p_k$ and the mixture probability of $x_1^n$ is
largest.
We consider here the mixture of those $k$-th order
Markov chains whose starting distribution is
uniform on $A^k$, taking  as prior the $|A|^k$-fold
product of the $(\frac{1}{2},\dots,\frac{1}{2})$
Dirichlet distributions put on the transition
probability matrices $Q(\cdot|\cdot)$. This mixture
distribution $\KT_k(x_1^n)$, whose explicit form is
given in (\ref{KT-k-def}), below, plays a distinguished
role in the theory of universal coding; see Krichevsky
and Trofimov,
\cite{Krichevsky-Trofimov}. It can be shown that
 \begin{equation}
-\log \KT_k(x_1^n)\approx-\log P_{\ml(k)}(x_1^n)+
     \frac{|A|^k(|A|-1)}{2}\log n,
 \label{KT-approx}    \end{equation}
 as $n\goesto\infty$ with $k$ fixed, which is the
principal reason for the estimator
(\ref{BIC-estimator}).
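
The mixture $\KT_k$ is convenient computationally as well: the
Dirichlet $(\frac{1}{2},\dots,\frac{1}{2})$ prior makes it a
product of sequential predictive probabilities, each of the
familiar add-$\frac{1}{2}$ form.  A minimal sketch (in Python,
with illustrative names):
\begin{verbatim}
from collections import defaultdict
from math import log

def log_kt(x, k, A):
    # log KT_k(x_1^n): the symbol b following context s gets
    # predictive probability (N(s,b) + 1/2) / (N(s) + |A|/2),
    # and the initial k symbols are uniform on A^k
    n_ctx, n_tr = defaultdict(int), defaultdict(int)
    logp = -k * log(len(A))
    for i in range(k, len(x)):
        s, b = tuple(x[i - k:i]), x[i]
        logp += log((n_tr[s, b] + 0.5) / (n_ctx[s] + len(A) / 2))
        n_tr[s, b] += 1
        n_ctx[s] += 1
    return logp
\end{verbatim}
Telescoping this product over the successive occurrences of each
context recovers the closed form (\ref{KT-k-def}).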

 In contrast to the BIC consistency theorem, for
the  Bayesian order estimator
   \begin{equation}
  \hat{k}_{\kt}(x_1^n)=\arg\min_k\biggl(-\log
p_k-\log
\mbox{KT}_k(x_1^n)\biggr),
  \label{Bayes-estimator}    \end{equation}
we  prove the following.

\begin{thmin} 
If $\{X_n\}$ is i.i.d.\ with $X_n$
uniformly distributed on $A$, then
$\hat{k}_{\kt}(x_1^n)\goesto+\infty$, almost
surely, provided  the  $p_k$ of
\eqref{Bayes-estimator} are taken, as usual, to
be slowly decreasing, say
$p_k=ck^{-2}$.
\end{thmin}
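
A sketch of the estimator (\ref{Bayes-estimator}) itself, reusing
\texttt{log\_kt} from above (the prior is shifted to
$p_k\propto(k+1)^{-2}$ so that $p_0$ is defined; the normalizing
constant does not affect the minimizer):
\begin{verbatim}
def kt_order(x, A, k_max):
    # minimize -log p_k - log KT_k(x_1^n) over k
    return min(range(k_max + 1),
               key=lambda k: 2 * log(k + 1) - log_kt(x, k, A))
\end{verbatim}
For $x_1^n$ drawn uniformly from $A^n$, the theorem predicts that
the returned value drifts upward without bound as $n$ grows.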

Similar phenomena have been
established in other Bayesian settings  by
Diaconis and Freedman; see, for example,
\cite{Diaconis-Freedman}.  The order
estimator $\hat{k}_{\kt}(x_1^n)$ is often
introduced via the minimum description length (MDL)
principle; see Rissanen,
\cite{Rissanen}, or Barron, Rissanen, and Yu,
\cite{Barron-Rissanen-Yu}.   It is interesting to
compare our inconsistency theorem with an
MDL-inspired result by Barron; see
\cite{Barron-Rissanen-Yu}.  That result, specialized to
the estimator $\hat{k}_{\kt}(x_1^n)$, can be formulated
as follows.   If the processes of  $\cM_k$ are
parametrized by an $|A|^k(|A|-1)$ dimensional parameter,
then for Lebesgue almost all choices of the parameter,
the estimator
$\hat{k}_{\kt}(x_1^n)$ is eventually almost surely
equal to the correct order.  Our inconsistency
theorem shows that the qualification ``for almost every
choice of the parameter'' is not
vacuous.

We note that the MDL principle also leads to
non-Bayesian estimators, replacing, e.g.,
mixture distributions by  normalized maximum
likelihoods; see \cite{Barron-Rissanen-Yu}. In our
context this means replacing $\KT_k(x_1^n)$ by
  \begin{equation*}
  P_{\ml(k)}(x_1^n)\bigg/\sum_{y_1^n\in
A^n}P_{\ml(k)}(y_1^n).
  \end{equation*}
The inconsistency theorem also holds for the
resulting order estimator.
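
For very small $n$ the normalized maximum likelihood can be
computed by brute force, which at least makes the definition
concrete (the normalizing sum has $|A|^n$ terms, so this is
illustrative only; \texttt{max\_log\_likelihood} is reused from
the first sketch):
\begin{verbatim}
from itertools import product
from math import exp, log

def log_nml(x, k, A):
    # log of P_ML(k)(x) / sum over all y in A^n of P_ML(k)(y)
    n = len(x)
    log_norm = log(sum(exp(max_log_likelihood(y, k))
                       for y in product(A, repeat=n)))
    return max_log_likelihood(x, k) - log_norm
\end{verbatim}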

\section{Sketches of the proofs}
Fix a stationary, irreducible Markov chain of order
$k_0$ and distribution $Q$.  In the sequel we use the notation
$N_n(a_1^m)=N(a_1^m|x_1^n)$.  The proof of the typicality theorem
utilizes the fact that for any
$m>k_0$, the sequence
   \begin{equation*}
   \{N_n(a_1^m)-N_{n-1}(a_1^{m-1})
Q(a_m|a_{m-k_0}^{m-1})\colon n\geq m\}
  \end{equation*}
  is a martingale: its increment at time $n$ is
$\mathbf{1}\{x_{n-m+1}^{n}=a_1^m\}
-\mathbf{1}\{x_{n-m+1}^{n-1}=a_1^{m-1}\}\,Q(a_m|a_{m-k_0}^{m-1})$,
which has zero expectation conditionally on $x_1^{n-1}$.  A
direct calculation of the variance and an application
of Kolmogorov's inequality for
martingales then give
  \begin{equation*}
  \mbox{Prob}\biggl\{
  \max_{n\leq 2^i}\biggl|
   N_{n}(a_1^{m})-
  N_{n-1}(a_1^{m-1})
  Q(a_{m}|a_{m-k_0}^{m-1})
\biggr|>\lambda\biggr\}<\frac{2^iQ(a_1^m)}
{\lambda^2}.
  \end{equation*} This, in turn, for suitable $\alpha>0$ and
$\beta>0 $ leads to a summable bound on the probability
of the event  that
  \begin{equation*}
  \max_{2^{i-1}<n\leq 2^{i}}\biggl|\frac{\hP_n(a_1^{k})}
  {Q(a_1^{k})}-1\biggr|>2^{-i\beta}
  \end{equation*}
occurs for some
$k\in (k_0,\alpha \log 2^i]$ and
$a_1^k\in A^k$ with $Q(a_1^k)>0$; by the Borel--Cantelli lemma,
this event then occurs for only finitely many $i$, almost surely.
The typicality theorem follows because the assertion is clearly
true for $k\leq k_0$, by
the law of the iterated logarithm for Markov chains.


Next, we use a counting argument known in information
theory as the method of types (see \cite{Csiszar-Korner}) to
bound the probability of the event that both
$\hat{k}_{\bic}(x_1^n)=k$ and $x_1^n$ is $(k+1,
n^{-\beta})$-typical. One role of typicality in our
argument is that the number of different $(k+1)$-types
that typical sequences $x_1^n$ can have admits a
better upper bound than the number of all possible
$(k+1)$-types. It can be shown that for the $\alpha$ and
$\beta$ of the typicality theorem and $n^*$ and $k^*$
large enough, the probability that
$x_1^n$ is both $(k+1,n^{-\beta})$-typical and
$\hat{k}_{\bic}(x_1^n)=k$ is upper bounded by $n^{-2}$ for
every
$n\geq n^*$ and $k^*\leq k\leq \alpha \log n$.  It
follows that, eventually almost surely,
$\hat{k}_{\bic}(x_1^n)$ cannot belong to the
interval $(k^*, \alpha \log n)$.
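
The counting step rests on the crude bound, standard in the
method of types: each count $N_n(a_1^{k+1})$ takes at most $n+1$
values, so the number of possible $(k+1)$-types of sequences of
length $n$ is at most
\begin{equation*}
(n+1)^{|A|^{k+1}}.
\end{equation*}
For $(k+1,n^{-\beta})$-typical sequences, each count is confined,
loosely, to an interval of length of order
$n^{1-\beta}Q(a_1^{k+1})$, which is the source of the better
bound just mentioned.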



  The key to ruling out larger values
of $k$ is that the
right-hand side of (\ref{KT-approx}) substantially overestimates
$-\log \KT_k(x_1^n)$ for large $n$ and $k$.
Indeed, using the explicit formula
\begin{equation}
   \mbox{KT}_k(x_1^n)=\frac{1}{|A|^k}
\prod_{a_1^k\in x_1^{n-1}} \left[
\frac{\prod_{a_{k+1}\colon a_1^{k+1}\in x_1^n}
(N_n(a_1^{k+1})-\frac{1}{2})(N_n(a_1^{k+1})-
\frac{3}{2})\cdots (\frac{1}{2})}
{(N_{n-1}(a_1^k)-1+\frac{|A|}{2})
(N_{n-1}(a_1^k)-2+\frac{|A|}{2})
\cdots(\frac{|A|}{2})}\right],
  \label{KT-k-def}    \end{equation}
 it can be shown that
 \begin{equation}
  \log \KT_k(x_1^n)\geq\log P_{\ml(k)}(x_1^n)
-\frac{|A|^k(|A|-1)}{2} \log^{+}
\frac{n}{|A|^k} - C|A|^k,
  \label{k-lower-bound}    \end{equation}
for a suitable constant $C$, independent of
$n$ and $k$.  This
easily leads to
  $
  \hat{k}_{\bic}(x_1^n)=o(\log n), \mbox{ a.s.}
  $
  The BIC consistency
theorem follows, since it is known (see \cite{Finesso})
that   $\hat{k}_{\bic}(x_1^n)\not\in
[0,k_0)\cup (k_0,k^*],
\mbox{ eventually a.s.}$



 Now let $Q$ denote the i.i.d.\ process with
$Q(a)=|A|^{-1}, a\in A$.  Two ideas are used to
establish the inconsistency theorem.  The first is
that if no $k$-block occurs in
$x_1^{n-1}$ more than once, then $\KT_k(x_1^n)=|A|^{-n}$.
This follows easily from  the representation
(\ref{KT-k-def}).  The second is that if $\alpha$
is large enough and $k=\alpha \log n$, the probability
that a
$k$-block appears more than once in $x_1^{n-1}$ is
upper bounded by $n^{-2}$.  The two ideas together
show that $\hat{k}_{\kt}(x_1^n)>0$, eventually almost
surely. Using a bound similar to
(\ref{k-lower-bound}), it can be shown that
$\hat{k}_{\kt}(x_1^n)\neq k$, eventually almost
surely, for any $k>0$, which combines with the
above to show that $\hat{k}_{\kt}(x_1^n)\goesto
\infty,$ almost surely.
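
The first of these ideas is immediate in the sequential form of
the KT mixture: if no $k$-block of $x_1^{n-1}$ occurs twice, then
every context appears for the first time when it is used, each
predictive probability equals
$(0+\frac{1}{2})/(0+\frac{|A|}{2})=|A|^{-1}$, and therefore
\begin{equation*}
\KT_k(x_1^n)=|A|^{-k}\cdot|A|^{-(n-k)}=|A|^{-n}.
\end{equation*}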

\bibliographystyle{amsplain}
\begin{thebibliography}{10}
\bibitem{Barron-Rissanen-Yu} A. Barron, J. Rissanen, and B.
Yu, \textit{The minimum description length principle in
coding and modeling,}  IEEE Trans. Inform.
Th. 44 (1998), 2743--2760.
\MR{99h:94032}
\bibitem{Csiszar-Korner} I. Csisz\'{a}r  and J.
K\"{o}rner,  \textit{Information Theory. Coding theorems
for discrete memoryless systems},  Akad\'{e}miai
Kiad\'{o}, Budapest, 1981.
\MR{84e:94007}
\bibitem{Csiszar-Shields} I. Csisz\'{a}r  and P. Shields,
\textit{The consistency of the BIC order estimator,} Ann. Statist.,
to appear.
\bibitem{Diaconis-Freedman} P. Diaconis and D.
Freedman, \textit{Nonparametric binary regression: a Bayesian
approach,} Ann. Statist. 21 (1993), 2108--2137.
\MR{94i:62001}
\bibitem{Finesso} L. Finesso, \textit{Estimation of the
order of a  finite Markov chain,} in \textit{Recent
Advances in the Mathematical Theory of Systems,
Control, and Network Signals, Proc. MTNS-91,} H.
Kimura and S. Kodama, Eds., Mita Press, 1992, pp. 643--645.
\MR{93g:93005}
\bibitem{Flajolet-etal} P.  Flajolet,
P. Kirschenhofer, and R. F. Tichy,
\textit{Deviations from uniformity  in random
strings,} Probab. Th. Rel. Fields 80 (1988),
139--150.
\MR{90a:11087}
\bibitem{Haughton} D. Haughton, \textit{On the choice of a model to fit
data from an exponential family,} Ann. Statist. 16 (1988),
342--355.
\MR{89e:62036}
\bibitem{Kieffer}  J. Kieffer, \textit{Strongly
consistent code-based identification and order estimation
for constrained finite-state model classes,} IEEE Trans.
Inform. Th. 39 (1993), 893--902.
\bibitem{Krichevsky-Trofimov} R. E. Krichevsky and 
V. K. Trofimov, \textit{The performance of universal
encoding,} IEEE Trans. Inform. Th. 27 (1981), 199--207.
\MR{83e:94030}
\bibitem{Marton-Shields} K. Marton and P. Shields,
\textit{Entropy and the consistent estimation of joint
distributions,}  Ann. Probab. 22 (1994),
960--977. (Correction, Ann. Probab. 24 (1996),
541--545.)
\MR{95g:94004}; \MR{97c:94004}
\bibitem{Rissanen}  J. Rissanen, \textit{ Stochastic complexity
in statistical inquiry}, World Scientific, Singapore, 1989.
\MR{92f:68076}
\bibitem{Schwarz} G. Schwarz, \textit{Estimating the dimension of
a model,} Ann. Statist. 6 (1978), 461--464.
\MR{57:7855}
\end{thebibliography}

\end{document}