View Javadoc

1   /*
2    * $Id: Algorithms.java,v 1.4 2010/01/11 21:22:46 pah Exp $
3    * 
4    * Created on 8 Dec 2008 by Paul Harrison (paul.harrison@manchester.ac.uk)
5    * Copyright 2008 Astrogrid. All rights reserved.
6    *
7    * This software is published under the terms of the Astrogrid 
8    * Software License, a copy of which has been included 
9    * with this distribution in the LICENSE.txt file.  
10   *
11   */ 
12  
13  package org.astrogrid.matrix;
14  
15  import static org.astrogrid.matrix.MatrixUtils.*;
16  
17  import java.util.SortedMap;
18  import java.util.TreeMap;
19  
20  import org.apache.commons.math.special.Gamma;
21  import org.netlib.blas.BLAS;
22  import org.netlib.blas.Dtrsv;
23  import org.netlib.lapack.LAPACK;
24  
25  import no.uib.cipr.matrix.AGDenseMatrix;
26  import no.uib.cipr.matrix.DenseCholesky;
27  import no.uib.cipr.matrix.DenseMatrix;
28  import no.uib.cipr.matrix.DenseVector;
29  import no.uib.cipr.matrix.MatrixEntry;
30  import no.uib.cipr.matrix.Vector;
31  
32  import static java.lang.Math.*;
33  
34  /**
35   * Some general matrix based algorithms.
36   * @author Paul Harrison (paul.harrison@manchester.ac.uk) 8 Dec 2008
37   * @version $Name:  $
38   * @since VOTech Stage 8
39   */
40  public class Algorithms {
41  
42      public static final double eps= 2.2204e-16 ; //FIXME what is eps
43      /** logger for this class */
44      private static final org.apache.commons.logging.Log logger = org.apache.commons.logging.LogFactory
45              .getLog(Algorithms.class);
46      
47      /** Comment for <code>kmeans</code>
48       * %KMEANS  Trains a k means cluster model.
49  
50         Description
51          CENTRES = KMEANS(CENTRES, DATA, OPTIONS) uses the batch K-means
52         algorithm to set the centres of a cluster model. The matrix DATA
53         represents the data which is being clustered, with each row
54         corresponding to a vector. The sum of squares error function is used.
55         The point at which a local minimum is achieved is returned as
56         CENTRES.  The error value at that point is returned in OPTIONS(8).
57  
58         [CENTRES, OPTIONS, POST, ERRLOG] = KMEANS(CENTRES, DATA, OPTIONS)
59         also returns the cluster number (in a one-of-N encoding) for each
60         data point in POST and a log of the error values after each cycle in
61         ERRLOG.    The optional parameters have the following
62         interpretations.
63  
64         OPTIONS(1) is set to 1 to display error values; also logs error
65         values in the return argument ERRLOG. If OPTIONS(1) is set to 0, then
66         only warning messages are displayed.  If OPTIONS(1) is -1, then
67         nothing is displayed.
68  
69         OPTIONS(2) absprec is a measure of the absolute precision required for the
70         value of CENTRES at the solution.  If the absolute difference between
71         the values of CENTRES between two successive steps is less than
72         OPTIONS(2), then this condition is satisfied.
73  
74         OPTIONS(3) errprec is a measure of the precision required of the error
75         function at the solution.  If the absolute difference between the
76         error functions between two successive steps is less than OPTIONS(3),
77         then this condition is satisfied. Both this and the previous
78         condition must be satisfied for termination.
79  
80         OPTIONS(14) is the maximum number of iterations; default 100.
81  
82       */
83      public static void kmeans(Matrix centres, Matrix data, int niters, double absprec, double errprec, boolean fromData, boolean verbose) {
84   
85          int ndata = data.numRows(), data_dim = data.numColumns();
86          int ncentres = centres.numRows(), dim= centres.numColumns();
87          double sumsq;
88  
89          if (dim != data_dim)
90          {
91            logger.error("Data dimension does not match dimension of centres");
92            return; //TODO need to make this into some sort of exception.
93          }
94  
95          if (ncentres > ndata){
96            logger.error("More centres than data");
97            return;
98          }
99  
100  
101         int store = 1;
102        
103          Matrix errlog = (Matrix) new AGDenseMatrix(1, niters).zero();
104         
105 
106         // Check if centres and posteriors need to be initialised from data
107         if (fromData){
108           // Do the initialisation
109           int[] perm = randperm(ndata-1);
110 
111           // Assign first ncentres (permuted) data points as centres
112           for (int i = 0; i < ncentres; i++) {
113             for(int j = 0; j < data_dim; j++)
114             {
115               centres.set(i, j, data.get(perm[i], j));   
116             }
117             }
118         }
119 
120         // Main loop of algorithm
121         Matrix old_centres = new AGDenseMatrix(ncentres, dim);
122         double e = 0, old_e = 0;
123 
124         for(int n = 0; n < niters; n++){
125 
126           // Save old centres to check for termination 
127           old_centres.set(centres);
128           
129           // Calculate posteriors based on existing centres
130           Matrix d2 = dist2(data, centres);
131           // Assign each point to nearest centre
132           MvRet mv = minvals(d2);
133           
134 
135           // Adjust the centres based on new posteriors          
136           double[][] sums = new double[data_dim][ncentres];
137           int[] num_points = new int[ncentres];
138           e = 0;
139           for(int i = 0; i < ndata; i++)
140           {
141               for (int j = 0; j < data_dim; j++){
142                   sums[j][mv.index[i]] += data.get(i, j);
143               }
144               num_points[mv.index[i]]++;
145               e += mv.minvals[i];
146           }
147           for (int j = 0; j < ncentres; j++){
148               if (num_points[j] > 0){
149                   for (int i=0; i < data_dim; i++){
150                     centres.set(j,i, sums[i][j]/num_points[j]);
151                   }
152               }
153             }
154 
155           // Error value is total squared distance from cluster centres
156 //          if (store){
157 //            errlog(n) = e;
158 //          }
159           if (verbose){
160             System.out.printf( "Cycle %4d  Error %11.6f\n", n, e);
161           }
162 
163           if ( n > 1 ){
164             // Test for termination
165             if (old_centres.add(-1.0, centres).norm(Matrix.Norm.Maxvalue) < absprec && 
166                 Math.abs(old_e - e) < errprec){
167               sumsq = e;
168               return;
169             }
170           }
171           old_e = e;
172          }
173 
174         // If we get here, then we haven't terminated in the given number of 
175         // iterations.
176         sumsq = e;
177         if (verbose){
178           logger.warn("Warning: Maximum number of iterations has been exceeded");
179         }
180 
181 
182         
183     }
184     
185     
186     public static Matrix centre_kmeans(Matrix data, int nclus, int ndim){
187         
188         
189         Matrix centres= (Matrix) new AGDenseMatrix(nclus, ndim).zero();
190         kmeans(centres, data, 15, 0.001, 0.001, true, true);
191         return centres;
192         
193     }
194     
195     private static class MvRet {
196         int[] index;
197         double[] minvals;
198     }
199     private static  MvRet minvals(Matrix d) {
200      
201         MvRet retval = new MvRet();
202         retval.index = new int[d.numRows()];
203         retval.minvals = new double[d.numRows()];
204         for (int i = 0; i < d.numRows(); i++) {
205             int icolmin = 0;
206             retval.minvals[i] = d.get(i, 0);
207             for (int j = 1; j < d.numColumns(); j++){
208                 double val = d.get(i, j);
209                 if(retval.minvals[i] > val){
210                     retval.minvals[i] = val;
211                     icolmin = j;
212                 }
213                 
214             }
215             retval.index[i] = icolmin;
216             
217         }
218         return retval;
219     }
220     
221     /**
222      * %DIST2   Calculates squared distance between two sets of points.
223 
224 *       Description
225 *       D = DIST2(X, C) takes two matrices of vectors and calculates the
226 *       squared Euclidean distance between them.  Both matrices must be of
227 *       the same column dimension.  If X has M rows and N columns, and C has
228 *       L rows and N columns, then the result has M rows and L columns.  The
229 *       I, Jth entry is the  squared distance from the Ith row of X to the
230 *       Jth row of C.
231 *
232 
233      * @param data
234      * @param centres
235      * @return
236      */
237     static public Matrix dist2(Matrix x, Matrix c) {
238         int ndata= x.numRows(), dimx = x.numColumns() ;
239         int ncentres = c.numRows(), dimc = c.numColumns();
240         assert dimx == dimc:
241                 "Data dimension does not match dimension of centres";
242         Matrix y = AGDenseMatrix.repeatColumn(x.pow(2).sum(2),ncentres);
243         Matrix d = AGDenseMatrix.repeatRow(c.pow(2).sum(2), ndata);
244         Matrix result = new AGDenseMatrix(ndata,ncentres);
245         
246         x.transBmult(-2.0, c, result);
247         result.add(y.add(d));
248         
249         // Rounding errors occasionally cause negative entries in n2
250         for (MatrixEntry e : result) {
251             if(e.get() < 0){
252                 e.set(0.0);
253             }
254         }
255         return result;
256     }
257     
258     
259     /*
260      * slternative implentation.
261      *    static Matrix dist2(Matrix x, Matrix c){
262         
263         int ndata = x.numRows(), dimx =x.numColumns();
264         int ncentres = c.numRows(), dimc = c.numColumns();
265 
266 
267         if (x.numColumns() == c.numColumns()) {
268             Matrix retval = new AGDenseMatrix(ndata, ncentres);
269             Matrix ret2 = (Matrix) mult(sumsq(c), ones(1,ndata)).transpose(retval);
270             retval.add(mult(sumsq(x),ones(1, ncentres)));
271             retval.add(multBt(x, c).scale(-2.0));
272             return retval;
273             
274             
275         } else {
276             throw new IllegalArgumentException("each matrix must have he same number of columns");
277 
278         }
279     }
280 
281      */
282 
283     /**
284      * RANDPERM Random permutation.
285       RANDPERM(n) is a random permutation of the integers from 1 to n.
286       For example, RANDPERM(6) might be [2 4 5 6 1 3].
287      * @param n
288      * @return
289      */
290     public static int[] randperm(int n){
291       SortedMap<Double, Integer> sortmap = new TreeMap<Double, Integer>();
292       for (int i = 0; i < n; i++) {
293         sortmap.put(Math.random(), i+1);
294     }
295       
296       int[] retval = new int[n];
297       int i=0;
298       for (Integer ii : sortmap.values()) {
299         retval[i++] = ii;
300     }
301       return retval;
302        
303     }
304     
305     
306     static public Matrix dist3_common(Matrix x, Matrix centres, Vector covars){
307      
308         int ndata = x.numRows(), ndim = x.numColumns();
309         int K = centres.numRows(), T =centres.numColumns();
310         Matrix n2;
311         // Calculate squared norm matrix, of dimension (ndata, ncentres)
312         n2 = divide(dist2(x, centres) , repmatt(covars, ndata, 1));
313         return n2;
314 
315     }
316  
317     static public Matrix dist3_diag(Matrix x, Matrix centres, Matrix covars){
318       //I think that there is actually a mistake in the MatLab code and the Mahalanobis distance is not correctly computed....However trying to copy the existing code.
319         int ndata = x.numRows(), ndim = x.numColumns();
320         int K = centres.numRows(), T =centres.numColumns();
321         Matrix n2 = new AGDenseMatrix(ndata, K);
322         for (int i = 0; i < K ;i++){
323         Matrix diffs = sub(x , mult(ones(ndata, 1) , centres.sliceRowM(i)));
324         Vector sums = sum(divide(pow(diffs,2.0),(mult(ones(ndata, 1) , 
325                 covars.sliceRowM(i)))), 2);
326         for (int j = 0; j < ndata ; j++)
327         n2.set(j, i, sums.get(j) );  
328         }
329        
330         
331         return n2;
332     }
333     static public Matrix dist3_free(Matrix x, Matrix centres, Matrix[] covars){
334         
335         int ndata = x.numRows(), ndim = x.numColumns();
336         int K = centres.numRows(), T =centres.numColumns();
337         Matrix n2 = new AGDenseMatrix(ndata, K);
338          
339                 for (int i = 0 ;i<K; i++){
340                     AGDenseMatrix diffs = (AGDenseMatrix) sub(x , mult(ones(ndata, 1) , centres.sliceRowM(i)));
341                     // Use Cholesky decomposition of covariance matrix to speed computation
342                     
343                      DenseCholesky c = DenseCholesky.factorize(covars[i]);
344                     
345                      no.uib.cipr.matrix.Matrix temp = new DenseMatrix(ndim, ndata);
346                     transpose(c.getU()).solve(transpose(diffs), temp);
347                     
348                     n2.setColumn(i,sum(times(temp,temp),1));
349                     }
350         //             n2(:,i) = sum(diffs 
351                 
352         return n2;
353         }
354     
355 
356     public static double gamma(double s){
357       return  Math.exp(Gamma.logGamma(s));
358     }
359     
360     public static Vector mean(Matrix m, int i){
361         Vector retval = sum(m, i);
362         if(i == 1){
363             return retval.scale(1.0/m.numRows());
364         }else {
365             return retval.scale(1.0/m.numColumns());
366         }
367     }
368     
369     public static int rem(int a, int b){
370         return a % b;
371     }
372     
373     
374     public static Vector multinorm(Matrix x, Vector m, Matrix covar ){
375         // Evaluates a multidimensional Gaussian
376         // of mean m and covariance matrix covar
377         // at the array of points x
378         //
379         int dim = x.numRows(), npoints=x.numColumns();
380         covar = add(covar, eye(dim,Double.MIN_VALUE));
381         double dd = det(covar);
382         Matrix in = inv(covar);
383         // covar
384         // pause
385         double ff = pow((2*PI),(-dim/2.0))*pow((dd),(-0.5));
386         Matrix centered = sub(x,repmat(m, 1, npoints));
387         Vector y;
388         if (dim != 1)
389            y =  exp(sum(times(centered,(mult(in,centered)))).scale(-0.5)).scale(ff);
390         else
391            y =  exp(mult(in,pow(centered,2)).asVector().scale(-0.5) ).scale(ff);
392         return y;
393 
394     }
395     public static Vector t_multinorm(Matrix x, Vector m, Matrix covar, double v ){
396         // Evaluates a multidimensional student t- of mean m and covariance matrix covar
397         // at the array of points x
398         //
399         //   y -- 1 x T
400         int dim = x.numRows(), npoints=x.numColumns();
401         covar = add(covar, eye(dim,Double.MIN_VALUE));
402         double dd = det(covar);
403         Matrix in = inv(covar);
404         double ff = gamma(0.5*(v+dim))/(pow(PI*v,0.5*dim) * gamma(v/2) * sqrt(dd));
405         Matrix centered = sub(x,repmat(m, 1, npoints));
406         Vector y = new DenseVector(npoints);
407         if (dim != 1){
408             for (int j  = 0; j <npoints; j++){
409                 double d = multAt(centered.sliceCol(j),mult(in,centered.sliceCol(j))).asScalar();
410                 y.set(j, ff / (  pow(1 + d/v,0.5*(v + dim))    ));
411             }
412             //y = ff * exp(-0.5*sum(centered.*(in*centered)));
413             //y = ff * ( (1.0 + sum(centered.*(in*centered)) / v) .^(-0.5*(v+dim)) );
414         }
415         else
416            y =  pow(add(1.0 , pow(mult(in,centered).asVector(),2).scale(1.0/v)) , (-0.5*(v+dim)) ).scale(ff);
417         
418       return y;
419     }
420        
421     
422 }
423 
424 
425 /*
426  * $Log: Algorithms.java,v $
427  * Revision 1.4  2010/01/11 21:22:46  pah
428  * reasonable numerical stability and fidelity to MATLAB results achieved
429  *
430  * Revision 1.3  2009/09/20 17:18:01  pah
431  * checking just prior to bham visit
432  *
433  * Revision 1.2  2009/09/14 19:08:42  pah
434  * code runs clustering, but not giving same results as matlab exactly
435  *
436  * Revision 1.1  2009/09/07 16:06:12  pah
437  * initial transcription of the core
438  *
439  */