1 /*
2 * $Id: Algorithms.java,v 1.4 2010/01/11 21:22:46 pah Exp $
3 *
4 * Created on 8 Dec 2008 by Paul Harrison (paul.harrison@manchester.ac.uk)
5 * Copyright 2008 Astrogrid. All rights reserved.
6 *
7 * This software is published under the terms of the Astrogrid
8 * Software License, a copy of which has been included
9 * with this distribution in the LICENSE.txt file.
10 *
11 */
12
13 package org.astrogrid.matrix;
14
15 import static org.astrogrid.matrix.MatrixUtils.*;
16
17 import java.util.SortedMap;
18 import java.util.TreeMap;
19
20 import org.apache.commons.math.special.Gamma;
21 import org.netlib.blas.BLAS;
22 import org.netlib.blas.Dtrsv;
23 import org.netlib.lapack.LAPACK;
24
25 import no.uib.cipr.matrix.AGDenseMatrix;
26 import no.uib.cipr.matrix.DenseCholesky;
27 import no.uib.cipr.matrix.DenseMatrix;
28 import no.uib.cipr.matrix.DenseVector;
29 import no.uib.cipr.matrix.MatrixEntry;
30 import no.uib.cipr.matrix.Vector;
31
32 import static java.lang.Math.*;
33
34 /**
35 * Some general matrix based algorithms.
36 * @author Paul Harrison (paul.harrison@manchester.ac.uk) 8 Dec 2008
37 * @version $Name: $
38 * @since VOTech Stage 8
39 */
40 public class Algorithms {
41
42 public static final double eps= 2.2204e-16 ; //FIXME what is eps
43 /** logger for this class */
44 private static final org.apache.commons.logging.Log logger = org.apache.commons.logging.LogFactory
45 .getLog(Algorithms.class);
46
47 /** Comment for <code>kmeans</code>
48 * %KMEANS Trains a k means cluster model.
49
50 Description
51 CENTRES = KMEANS(CENTRES, DATA, OPTIONS) uses the batch K-means
52 algorithm to set the centres of a cluster model. The matrix DATA
53 represents the data which is being clustered, with each row
54 corresponding to a vector. The sum of squares error function is used.
55 The point at which a local minimum is achieved is returned as
56 CENTRES. The error value at that point is returned in OPTIONS(8).
57
58 [CENTRES, OPTIONS, POST, ERRLOG] = KMEANS(CENTRES, DATA, OPTIONS)
59 also returns the cluster number (in a one-of-N encoding) for each
60 data point in POST and a log of the error values after each cycle in
61 ERRLOG. The optional parameters have the following
62 interpretations.
63
64 OPTIONS(1) is set to 1 to display error values; also logs error
65 values in the return argument ERRLOG. If OPTIONS(1) is set to 0, then
66 only warning messages are displayed. If OPTIONS(1) is -1, then
67 nothing is displayed.
68
69 OPTIONS(2) absprec is a measure of the absolute precision required for the
70 value of CENTRES at the solution. If the absolute difference between
71 the values of CENTRES between two successive steps is less than
72 OPTIONS(2), then this condition is satisfied.
73
74 OPTIONS(3) errprec is a measure of the precision required of the error
75 function at the solution. If the absolute difference between the
76 error functions between two successive steps is less than OPTIONS(3),
77 then this condition is satisfied. Both this and the previous
78 condition must be satisfied for termination.
79
80 OPTIONS(14) is the maximum number of iterations; default 100.
81
82 */
83 public static void kmeans(Matrix centres, Matrix data, int niters, double absprec, double errprec, boolean fromData, boolean verbose) {
84
85 int ndata = data.numRows(), data_dim = data.numColumns();
86 int ncentres = centres.numRows(), dim= centres.numColumns();
87 double sumsq;
88
89 if (dim != data_dim)
90 {
91 logger.error("Data dimension does not match dimension of centres");
92 return; //TODO need to make this into some sort of exception.
93 }
94
95 if (ncentres > ndata){
96 logger.error("More centres than data");
97 return;
98 }
99
100
101 int store = 1;
102
103 Matrix errlog = (Matrix) new AGDenseMatrix(1, niters).zero();
104
105
106 // Check if centres and posteriors need to be initialised from data
107 if (fromData){
108 // Do the initialisation
109 int[] perm = randperm(ndata-1);
110
111 // Assign first ncentres (permuted) data points as centres
112 for (int i = 0; i < ncentres; i++) {
113 for(int j = 0; j < data_dim; j++)
114 {
115 centres.set(i, j, data.get(perm[i], j));
116 }
117 }
118 }
119
120 // Main loop of algorithm
121 Matrix old_centres = new AGDenseMatrix(ncentres, dim);
122 double e = 0, old_e = 0;
123
124 for(int n = 0; n < niters; n++){
125
126 // Save old centres to check for termination
127 old_centres.set(centres);
128
129 // Calculate posteriors based on existing centres
130 Matrix d2 = dist2(data, centres);
131 // Assign each point to nearest centre
132 MvRet mv = minvals(d2);
133
134
135 // Adjust the centres based on new posteriors
136 double[][] sums = new double[data_dim][ncentres];
137 int[] num_points = new int[ncentres];
138 e = 0;
139 for(int i = 0; i < ndata; i++)
140 {
141 for (int j = 0; j < data_dim; j++){
142 sums[j][mv.index[i]] += data.get(i, j);
143 }
144 num_points[mv.index[i]]++;
145 e += mv.minvals[i];
146 }
147 for (int j = 0; j < ncentres; j++){
148 if (num_points[j] > 0){
149 for (int i=0; i < data_dim; i++){
150 centres.set(j,i, sums[i][j]/num_points[j]);
151 }
152 }
153 }
154
155 // Error value is total squared distance from cluster centres
156 // if (store){
157 // errlog(n) = e;
158 // }
159 if (verbose){
160 System.out.printf( "Cycle %4d Error %11.6f\n", n, e);
161 }
162
163 if ( n > 1 ){
164 // Test for termination
165 if (old_centres.add(-1.0, centres).norm(Matrix.Norm.Maxvalue) < absprec &&
166 Math.abs(old_e - e) < errprec){
167 sumsq = e;
168 return;
169 }
170 }
171 old_e = e;
172 }
173
174 // If we get here, then we haven't terminated in the given number of
175 // iterations.
176 sumsq = e;
177 if (verbose){
178 logger.warn("Warning: Maximum number of iterations has been exceeded");
179 }
180
181
182
183 }
184
185
186 public static Matrix centre_kmeans(Matrix data, int nclus, int ndim){
187
188
189 Matrix centres= (Matrix) new AGDenseMatrix(nclus, ndim).zero();
190 kmeans(centres, data, 15, 0.001, 0.001, true, true);
191 return centres;
192
193 }
194
195 private static class MvRet {
196 int[] index;
197 double[] minvals;
198 }
199 private static MvRet minvals(Matrix d) {
200
201 MvRet retval = new MvRet();
202 retval.index = new int[d.numRows()];
203 retval.minvals = new double[d.numRows()];
204 for (int i = 0; i < d.numRows(); i++) {
205 int icolmin = 0;
206 retval.minvals[i] = d.get(i, 0);
207 for (int j = 1; j < d.numColumns(); j++){
208 double val = d.get(i, j);
209 if(retval.minvals[i] > val){
210 retval.minvals[i] = val;
211 icolmin = j;
212 }
213
214 }
215 retval.index[i] = icolmin;
216
217 }
218 return retval;
219 }
220
221 /**
222 * %DIST2 Calculates squared distance between two sets of points.
223
224 * Description
225 * D = DIST2(X, C) takes two matrices of vectors and calculates the
226 * squared Euclidean distance between them. Both matrices must be of
227 * the same column dimension. If X has M rows and N columns, and C has
228 * L rows and N columns, then the result has M rows and L columns. The
229 * I, Jth entry is the squared distance from the Ith row of X to the
230 * Jth row of C.
231 *
232
233 * @param data
234 * @param centres
235 * @return
236 */
237 static public Matrix dist2(Matrix x, Matrix c) {
238 int ndata= x.numRows(), dimx = x.numColumns() ;
239 int ncentres = c.numRows(), dimc = c.numColumns();
240 assert dimx == dimc:
241 "Data dimension does not match dimension of centres";
242 Matrix y = AGDenseMatrix.repeatColumn(x.pow(2).sum(2),ncentres);
243 Matrix d = AGDenseMatrix.repeatRow(c.pow(2).sum(2), ndata);
244 Matrix result = new AGDenseMatrix(ndata,ncentres);
245
246 x.transBmult(-2.0, c, result);
247 result.add(y.add(d));
248
249 // Rounding errors occasionally cause negative entries in n2
250 for (MatrixEntry e : result) {
251 if(e.get() < 0){
252 e.set(0.0);
253 }
254 }
255 return result;
256 }
257
258
259 /*
260 * slternative implentation.
261 * static Matrix dist2(Matrix x, Matrix c){
262
263 int ndata = x.numRows(), dimx =x.numColumns();
264 int ncentres = c.numRows(), dimc = c.numColumns();
265
266
267 if (x.numColumns() == c.numColumns()) {
268 Matrix retval = new AGDenseMatrix(ndata, ncentres);
269 Matrix ret2 = (Matrix) mult(sumsq(c), ones(1,ndata)).transpose(retval);
270 retval.add(mult(sumsq(x),ones(1, ncentres)));
271 retval.add(multBt(x, c).scale(-2.0));
272 return retval;
273
274
275 } else {
276 throw new IllegalArgumentException("each matrix must have he same number of columns");
277
278 }
279 }
280
281 */
282
283 /**
284 * RANDPERM Random permutation.
285 RANDPERM(n) is a random permutation of the integers from 1 to n.
286 For example, RANDPERM(6) might be [2 4 5 6 1 3].
287 * @param n
288 * @return
289 */
290 public static int[] randperm(int n){
291 SortedMap<Double, Integer> sortmap = new TreeMap<Double, Integer>();
292 for (int i = 0; i < n; i++) {
293 sortmap.put(Math.random(), i+1);
294 }
295
296 int[] retval = new int[n];
297 int i=0;
298 for (Integer ii : sortmap.values()) {
299 retval[i++] = ii;
300 }
301 return retval;
302
303 }
304
305
306 static public Matrix dist3_common(Matrix x, Matrix centres, Vector covars){
307
308 int ndata = x.numRows(), ndim = x.numColumns();
309 int K = centres.numRows(), T =centres.numColumns();
310 Matrix n2;
311 // Calculate squared norm matrix, of dimension (ndata, ncentres)
312 n2 = divide(dist2(x, centres) , repmatt(covars, ndata, 1));
313 return n2;
314
315 }
316
317 static public Matrix dist3_diag(Matrix x, Matrix centres, Matrix covars){
318 //I think that there is actually a mistake in the MatLab code and the Mahalanobis distance is not correctly computed....However trying to copy the existing code.
319 int ndata = x.numRows(), ndim = x.numColumns();
320 int K = centres.numRows(), T =centres.numColumns();
321 Matrix n2 = new AGDenseMatrix(ndata, K);
322 for (int i = 0; i < K ;i++){
323 Matrix diffs = sub(x , mult(ones(ndata, 1) , centres.sliceRowM(i)));
324 Vector sums = sum(divide(pow(diffs,2.0),(mult(ones(ndata, 1) ,
325 covars.sliceRowM(i)))), 2);
326 for (int j = 0; j < ndata ; j++)
327 n2.set(j, i, sums.get(j) );
328 }
329
330
331 return n2;
332 }
333 static public Matrix dist3_free(Matrix x, Matrix centres, Matrix[] covars){
334
335 int ndata = x.numRows(), ndim = x.numColumns();
336 int K = centres.numRows(), T =centres.numColumns();
337 Matrix n2 = new AGDenseMatrix(ndata, K);
338
339 for (int i = 0 ;i<K; i++){
340 AGDenseMatrix diffs = (AGDenseMatrix) sub(x , mult(ones(ndata, 1) , centres.sliceRowM(i)));
341 // Use Cholesky decomposition of covariance matrix to speed computation
342
343 DenseCholesky c = DenseCholesky.factorize(covars[i]);
344
345 no.uib.cipr.matrix.Matrix temp = new DenseMatrix(ndim, ndata);
346 transpose(c.getU()).solve(transpose(diffs), temp);
347
348 n2.setColumn(i,sum(times(temp,temp),1));
349 }
350 // n2(:,i) = sum(diffs
351
352 return n2;
353 }
354
355
356 public static double gamma(double s){
357 return Math.exp(Gamma.logGamma(s));
358 }
359
360 public static Vector mean(Matrix m, int i){
361 Vector retval = sum(m, i);
362 if(i == 1){
363 return retval.scale(1.0/m.numRows());
364 }else {
365 return retval.scale(1.0/m.numColumns());
366 }
367 }
368
369 public static int rem(int a, int b){
370 return a % b;
371 }
372
373
374 public static Vector multinorm(Matrix x, Vector m, Matrix covar ){
375 // Evaluates a multidimensional Gaussian
376 // of mean m and covariance matrix covar
377 // at the array of points x
378 //
379 int dim = x.numRows(), npoints=x.numColumns();
380 covar = add(covar, eye(dim,Double.MIN_VALUE));
381 double dd = det(covar);
382 Matrix in = inv(covar);
383 // covar
384 // pause
385 double ff = pow((2*PI),(-dim/2.0))*pow((dd),(-0.5));
386 Matrix centered = sub(x,repmat(m, 1, npoints));
387 Vector y;
388 if (dim != 1)
389 y = exp(sum(times(centered,(mult(in,centered)))).scale(-0.5)).scale(ff);
390 else
391 y = exp(mult(in,pow(centered,2)).asVector().scale(-0.5) ).scale(ff);
392 return y;
393
394 }
395 public static Vector t_multinorm(Matrix x, Vector m, Matrix covar, double v ){
396 // Evaluates a multidimensional student t- of mean m and covariance matrix covar
397 // at the array of points x
398 //
399 // y -- 1 x T
400 int dim = x.numRows(), npoints=x.numColumns();
401 covar = add(covar, eye(dim,Double.MIN_VALUE));
402 double dd = det(covar);
403 Matrix in = inv(covar);
404 double ff = gamma(0.5*(v+dim))/(pow(PI*v,0.5*dim) * gamma(v/2) * sqrt(dd));
405 Matrix centered = sub(x,repmat(m, 1, npoints));
406 Vector y = new DenseVector(npoints);
407 if (dim != 1){
408 for (int j = 0; j <npoints; j++){
409 double d = multAt(centered.sliceCol(j),mult(in,centered.sliceCol(j))).asScalar();
410 y.set(j, ff / ( pow(1 + d/v,0.5*(v + dim)) ));
411 }
412 //y = ff * exp(-0.5*sum(centered.*(in*centered)));
413 //y = ff * ( (1.0 + sum(centered.*(in*centered)) / v) .^(-0.5*(v+dim)) );
414 }
415 else
416 y = pow(add(1.0 , pow(mult(in,centered).asVector(),2).scale(1.0/v)) , (-0.5*(v+dim)) ).scale(ff);
417
418 return y;
419 }
420
421
422 }
423
424
425 /*
426 * $Log: Algorithms.java,v $
427 * Revision 1.4 2010/01/11 21:22:46 pah
428 * reasonable numerical stability and fidelity to MATLAB results achieved
429 *
430 * Revision 1.3 2009/09/20 17:18:01 pah
431 * checking just prior to bham visit
432 *
433 * Revision 1.2 2009/09/14 19:08:42 pah
434 * code runs clustering, but not giving same results as matlab exactly
435 *
436 * Revision 1.1 2009/09/07 16:06:12 pah
437 * initial transcription of the core
438 *
439 */