|
Thore Graepel's Home Page
|
|
|
guru-article.bib@ARTICLE{grae97b, AUTHOR = {Thore Graepel and Matthias Burger and Klaus Obermayer}, TITLE = {Phase Transitions in Stochastic Self-Organizing Maps}, JOURNAL = {Physical Review E}, YEAR = {1997}, VOLUME = {56}, NUMBER = {4}, PAGES = {3876-3890}, ABSTRACT = {We describe the development of neighborhood-preserving stochastic maps in terms of a probabilistic clustering problem. Starting from a cost function for central clustering that incorporates distortions from channel noise we derive a soft topographic vector quantization algorithm (STVQ) which is based on the maximum entropy principle and which maximizes the corresponding likelihood in an expectation-maximization (EM) fashion. Among other algorithms a probabilistic version of Kohonen's self-organizing map (SOM) is derived from STVQ as a computationally efficient approximation of the E-step. The foundation of STVQ in statistical physics motivates a deterministic annealing scheme in the temperature parameter $\beta$, and leads to a robust minimization algorithm of the clustering cost function. In particular, this scheme offers an alternative to the common stepwise shrinking of the neighborhood width in the SOM and makes it possible to use its neighborhood function solely to encode the desired neighborhood relations between the clusters. The annealing in $\beta$, which corresponds to a stepwise refinement of the resolution of representation in data space, leads to the splitting of an existing cluster representation during the ``cooling'' process. We describe this phase transition in terms of the covariance matrix {\bf C} of the data and the transition matrix {\bf H} of the channel noise and calculate the critical temperatures and modes as functions the eigenvalues and eigenvectors of {\bf C} and {\bf H}. The analysis is extended to the phenomenon of the automatic selection of feature dimensions in dimension-reducing maps, thus leading to a ``batch''-alternative to the Fokker-Planck formalism for on-line learning. The results provide insights into the relation between the width of the neighborhood and the temperature parameter $\beta$: It is shown that the phase transition which leads to the representation of the excess-dimensions can be triggered not only by a change in the statistics of the input data but also by an increase of $\beta$, which corresponds to a decrease in noise level. The theoretical results are validated by numerical methods. In particular, a quantity equivalent to the heat capacity in thermodynamics is introduced to visualize the properties of the annealing process.}, URL = {http://stat.cs.tu-berlin.de/~guru/papers/graepel97_phys_rev_e_preprint.ps.gz} } @ARTICLE{grae98b, AUTHOR = {Thore Graepel and Matthias Burger and Klaus Obermayer}, TITLE = {Self-Organizing Maps: Generalizations and New Optimization Techniques}, JOURNAL = {Neurocomputing}, YEAR = {1998}, VOLUME = {20}, PAGES = {173-190}, ABSTRACT = {We offer three algorithms for the generation of topographic mappings to the practitioner of unsupervised data analysis. The algorithms are each based on the minimization of a cost function which is performed using an EM algorithm and deterministic annealing. The soft topographic vector quantization algorithm (STVQ) - like the original Self-Organizing Map (SOM) - provides a tool for the creation of self-organizing maps of Euclidean data. Its optimization scheme, however, offers an alternative to the heuristic stepwise shrinking of the neighborhood width in the SOM and makes it possible to use a fixed neighborhood function solely to encode desired neighborhood relations between nodes. The kernel-based soft topographic mapping (STMK) is a generalization of STVQ and introduces new distance measures in data space based on kernel functions. Using the new distance measures corresponds to performing the STVQ in a high-dimensional feature space, which is related to data space by a nonlinear mapping. This preprocessing can reveal structure of the data which may go unnoticed if the STVQ is performed in the standard Euclidean space. The soft topographic mapping for proximity data (STMP) is another generalization of STVQ that enables the user to generate topographic maps for data which are given in terms of pairwise proximities. It thus offers a flexible alternative to multidimensional scaling methods and opens up a new range of applications for Self-Organizing Maps. Both STMK and STMP share the robust optimization properties of STVQ due to the application of deterministic annealing. In our contribution we discuss the algorithms together with their implementation and provide detailed pseudo-code and explanations.}, URL = {http://stat.cs.tu-berlin.de/~guru/papers/graepel98_neurocomp_SOM.ps.gz} } @ARTICLE{grae99a, AUTHOR = {Thore Graepel and Klaus Obermayer}, TITLE = {A Self-Organizing Map for Proximity Data}, JOURNAL = {Neural Computation}, YEAR = {1999}, VOLUME = {11}, PAGES = {139-155}, ABSTRACT = {We derive an efficient algorithm for topographic mapping of proximity data (TMP), which can be seen as an extension of Kohonen's Self-Organizing Map to arbitrary distance measures. The TMP cost function is derived in a Baysian framework of Folded Markov Chains for the description of autoencoders. It incorporates the data via a dissimilarity matrix ${\mathcal D}$ and the topographic neighborhood via a matrix ${\mathcal H}$ of transition probabilities. From the principle of Maximum Entropy a non-factorizing Gibbs-distribution is obtained, which is approximated in a mean-field fashion. This allows for Maximum Likelihood estimation using an EM algorithm. In analogy to the transition from Topographic Vector Quantization (TVQ) to the Self-organizing Map (SOM) we suggest an approximation to TMP which is computationally more efficient. In order to prevent convergence to local minima, an annealing scheme in the temperature parameter is introduced, for which the critical temperature of the first phase-transition is calculated in terms of ${\mathcal D}$ and ${\mathcal H}$. Numerical results demonstrate the working of the algorithm and confirm the analytical results. Finally, the algorithm is used to generate a connection map of areas of the cat's cerebral cortex.}, URL = {http://stat.cs.tu-berlin.de/~guru/papers/graepel98_tmp_nc.ps.gz} } @ARTICLE{HerGraCam01, AUTHOR = {Ralf Herbrich and Thore Graepel and Colin Campbell}, TITLE = {Bayes Point Machines}, JOURNAL = {Journal of Machine Learning Research}, YEAR = {2001}, PAGES = {245--279}, VOLUME = {1}, ABSTRACT = {Kernel-classifiers comprise a powerful class of non-linear decision functions for binary classification. The support vector machine is an example of a learning algorithm for kernel classifiers that singles out the consistent classifier with the largest margin, i.e. minimal real-valued output on the training sample, within the set of consistent hypotheses, the so-called version space. We suggest the Bayes point machine as a well-founded improvement which approximates the Bayes-optimal decision by the centre of mass of version space. We present two algorithms to stochastically approximate the centre of mass of version space: a billiard sampling algorithm and a sampling algorithm based on the well known perceptron algorithm. It is shown how both algorithms can be extended to allow for soft-boundaries in order to admit training errors. Experimentally, we find that - for the zero training error case - Bayes point machines consistently outperform support vector machines on both surrogate data and real-world benchmark data sets. In the soft-boundary/soft-margin case, the improvement over support vector machines is shown to be reduced. Finally, we demonstrate that the real-valued output of single Bayes points on novel test points is a valid confidence measure and leads to a steady decrease in generalisation error when used as a rejection criterion.}, URL = {..\psfiles\HerGraCam01.ps.gz} } @MISC{HerGra02, AUTHOR = {Ralf Herbrich and Thore Graepel}, TITLE = {A {PAC}-{B}ayesian margin bound for linear classifiers}, JOURNAL = {IEEE Transactions on Information Theory}, YEAR = {2002}, NOTE = {to appear in IEEE Transactions on Information Theory}, ABSTRACT = {We present a bound on the generalisation error of linear classifiers in terms of a refined margin quantity on the training sample. The result is obtained in a PAC-Bayesian framework and is based on geometrical arguments in the space of linear classifiers. The new bound constitutes an exponential improvement of the so far tightest margin bound, which was developed in the luckiness framework, and scales logarithmically in the inverse margin. Even in the case of less training examples than input dimensions sufficiently large margins lead to non-trivial bound values and - for maximum margins - to a vanishing complexity term. In contrast to previous results, however, the new bound does depend on the dimensionality of feature space. The analysis shows that the classical margin is too coarse a measure for the essential quantity that controls the generalisation error: the fraction of hypothesis space consistent with the training sample. The practical relevance of the result lies in the fact that the well-known support vector machine is optimal with respect to the new bound only if the feature vectors in the training sample are all of the same length. As a consequence we recommend to use SVMs on normalised feature vectors only. Numerical simulations support this recommendation and demonstrate that the new error bound can be used for the purpose of model selection.}, URL = {..\psfiles\HerGra02.ps.gz} }
|