Thore Graepel's Home Page

 

Publications

Publications (no abstract)

guru-article.bib


@ARTICLE{grae97b,
  AUTHOR = {Thore Graepel and Matthias Burger and Klaus Obermayer},
  TITLE = {Phase Transitions in Stochastic Self-Organizing Maps},
  JOURNAL = {Physical Review E},
  YEAR = {1997},
  VOLUME = {56},
  NUMBER = {4},
  PAGES = {3876-3890},
  ABSTRACT = {We describe the development of neighborhood-preserving stochastic maps in terms of a probabilistic clustering problem. Starting from a cost function for central clustering that incorporates distortions from channel noise we derive a soft topographic vector quantization algorithm (STVQ) which is based on the maximum entropy principle and which maximizes the corresponding likelihood in an expectation-maximization (EM) fashion.
Among other algorithms a probabilistic version of Kohonen's self-organizing map (SOM) is derived from STVQ as a computationally  efficient approximation of the E-step. The foundation of STVQ in statistical physics motivates a deterministic annealing scheme in the temperature parameter $\beta$, and leads to a robust minimization algorithm of the clustering cost function.
In particular, this scheme offers an alternative to the common stepwise shrinking of the neighborhood width in the SOM and makes it possible to use its neighborhood function solely to encode the desired neighborhood relations between the clusters.
The annealing in $\beta$, which corresponds to a stepwise refinement of the resolution of representation in data space, leads to the splitting of an existing cluster representation during the ``cooling'' process. We describe this phase transition in terms of the covariance matrix {\bf C} of the data and the transition matrix {\bf H} of the channel noise and calculate the critical temperatures and modes as functions the eigenvalues and eigenvectors of {\bf C} and {\bf H}.
The analysis is extended to the phenomenon of the automatic selection of feature dimensions in dimension-reducing maps, thus leading to a ``batch''-alternative to the Fokker-Planck formalism for on-line learning. The results provide insights into the relation between the width of the neighborhood and the temperature parameter $\beta$: It is shown that the phase transition which leads to the representation of the excess-dimensions
can be triggered not only by a change in the statistics of the input data but also by an increase of $\beta$, which corresponds to a decrease in noise level. The theoretical results are validated by numerical methods. In particular, a quantity equivalent to the heat capacity in thermodynamics is introduced to visualize the properties of the annealing process.},
  URL = {http://stat.cs.tu-berlin.de/~guru/papers/graepel97_phys_rev_e_preprint.ps.gz}
}


@ARTICLE{grae98b,
  AUTHOR = {Thore Graepel and Matthias Burger and Klaus Obermayer},
  TITLE = {Self-Organizing Maps: Generalizations and New
          Optimization Techniques},
  JOURNAL = {Neurocomputing},
  YEAR = {1998},
  VOLUME = {20},
  PAGES = {173-190},
  ABSTRACT = {We offer three algorithms for the generation of
          topographic mappings to the practitioner of
          unsupervised data analysis. The algorithms are each
          based on the minimization of a cost function which
          is performed using an EM algorithm and deterministic
          annealing. The soft topographic vector quantization
          algorithm (STVQ) - like the
          original Self-Organizing Map (SOM) - provides a
          tool for the creation of self-organizing maps of
          Euclidean data. Its optimization scheme, however,
          offers an alternative to the heuristic stepwise
          shrinking of the neighborhood width in the SOM and
          makes it possible to use a fixed neighborhood
          function solely to encode desired neighborhood
          relations between nodes. The kernel-based soft topographic
          mapping (STMK) is a generalization
          of STVQ and introduces new distance measures in data
          space based on kernel functions. Using the new
          distance measures corresponds to performing the STVQ
          in a high-dimensional feature space, which is
          related to data space by a nonlinear mapping. This
          preprocessing can reveal structure of the data which
          may go unnoticed if the STVQ is performed in the
          standard Euclidean space. The soft topographic mapping for
          proximity data (STMP) is another
          generalization of STVQ that enables the user to
          generate topographic maps for data which are given
          in terms of pairwise proximities. It thus offers a
          flexible alternative to multidimensional scaling
          methods and opens up a new range of applications for
          Self-Organizing Maps. Both STMK and STMP share the robust
          optimization properties of STVQ
          due to the application of deterministic
          annealing. In our contribution we discuss the
          algorithms together with their implementation and
          provide detailed pseudo-code and explanations.},
  URL = {http://stat.cs.tu-berlin.de/~guru/papers/graepel98_neurocomp_SOM.ps.gz}
}


@ARTICLE{grae99a,
  AUTHOR = {Thore Graepel and Klaus Obermayer},
  TITLE = {A Self-Organizing Map for Proximity Data},
  JOURNAL = {Neural Computation},
  YEAR = {1999},
  VOLUME = {11},
  PAGES = {139-155},
  ABSTRACT = {We derive an efficient algorithm for topographic mapping of proximity data
(TMP), which can be seen as an extension of Kohonen's Self-Organizing Map to
arbitrary distance measures. The TMP cost function is derived in a Baysian
framework of Folded Markov Chains for the description of autoencoders. It incorporates the data via a dissimilarity matrix ${\mathcal D}$ and the topographic neighborhood via a matrix ${\mathcal H}$ of transition probabilities.
From the principle of Maximum Entropy a non-factorizing Gibbs-distribution is obtained, which is approximated in a mean-field fashion. This allows for Maximum Likelihood estimation using an EM algorithm. In analogy to the transition from Topographic Vector Quantization (TVQ) to the Self-organizing Map (SOM) we suggest an approximation to TMP which is computationally more efficient. In order to prevent convergence to local minima, an annealing scheme in the temperature parameter is introduced, for which the critical temperature of the first phase-transition is calculated in terms of ${\mathcal D}$ and ${\mathcal H}$.
Numerical results demonstrate the working of the algorithm and confirm the analytical results. Finally, the algorithm is used to generate a connection map of areas of the cat's cerebral cortex.},
  URL = {http://stat.cs.tu-berlin.de/~guru/papers/graepel98_tmp_nc.ps.gz}
}


@ARTICLE{HerGraCam01,
  AUTHOR = {Ralf Herbrich and Thore Graepel and Colin Campbell},
  TITLE = {Bayes Point Machines},
  JOURNAL = {Journal of Machine Learning Research},
  YEAR = {2001},
  PAGES = {245--279},
  VOLUME = {1},
  ABSTRACT = {Kernel-classifiers comprise a powerful class of non-linear decision functions for binary classification. The support vector machine is an example of a learning algorithm for kernel classifiers that singles out the consistent classifier with the largest margin, i.e. minimal real-valued output on the training sample, within the set of consistent hypotheses, the so-called version space. We suggest the Bayes point machine as a well-founded improvement which approximates the Bayes-optimal decision by the centre of mass of version space. We present two algorithms to stochastically approximate the centre of mass of version space: a billiard sampling algorithm and a sampling algorithm based on the well known perceptron algorithm. It is shown how both algorithms can be extended to allow for soft-boundaries in order to admit training errors. Experimentally, we find that - for the zero training error case - Bayes point machines consistently outperform support vector machines on both surrogate data and real-world benchmark data sets. In the soft-boundary/soft-margin case, the improvement over support vector machines is shown to be reduced. Finally, we demonstrate that the real-valued output of single Bayes points on novel test points is a valid confidence measure and leads to a steady decrease in generalisation error when used as a rejection criterion.},
  URL = {..\psfiles\HerGraCam01.ps.gz}
}


@MISC{HerGra02,
  AUTHOR = {Ralf Herbrich and Thore Graepel},
  TITLE = {A {PAC}-{B}ayesian margin bound for linear classifiers},
  JOURNAL = {IEEE Transactions on Information Theory},
  YEAR = {2002},
  NOTE = {to appear in IEEE Transactions on Information Theory},
  ABSTRACT = {We present a bound on the generalisation error of linear classifiers in terms of a refined margin quantity on the training sample. The result is obtained in a PAC-Bayesian framework and is based on geometrical arguments in the space of linear classifiers. The new bound constitutes an exponential improvement of the so far tightest margin bound, which was developed in the luckiness framework, and scales logarithmically in the inverse margin. Even in the case of less training examples than input dimensions sufficiently large margins lead to non-trivial bound values and - for maximum margins - to a vanishing complexity term. In contrast to previous results, however, the new bound does depend on the dimensionality of feature space. The analysis shows that the classical margin is too coarse a measure for the essential quantity that controls the generalisation error: the fraction of hypothesis space consistent with the training sample. The practical relevance of the result lies in the fact that the well-known support vector machine is optimal with respect to the new bound only if the feature vectors in the training sample are all of the same length. As a consequence we recommend to use SVMs on normalised feature vectors only. Numerical simulations support this recommendation and demonstrate that the new error bound can be used for the purpose of model selection.},
  URL = {..\psfiles\HerGra02.ps.gz}
}

horizontal rule

This file has been generated by bibtex2html 1.54