KMeans.h
Go to the documentation of this file.
1 /* Copyright (C) 2008 National Institute For Space Research (INPE) - Brazil.
2 
3  This file is part of the TerraLib - a Framework for building GIS enabled applications.
4 
5  TerraLib is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as published by
7  the Free Software Foundation, either version 3 of the License,
8  or (at your option) any later version.
9 
10  TerraLib is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public License
16  along with TerraLib. See COPYING. If not, write to
17  TerraLib Team at <terralib-team@terralib.org>.
18  */
19 
20 /*!
21  \file terralib/classification/KMeans.h
22 
23  \brief KMeans strategy for classification.
24 */
25 
26 #ifndef __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
27 #define __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
28 
29 // TerraLib
30 #include "../common/AbstractParameters.h"
31 #include "../common/progress/TaskProgress.h"
32 #include "../raster/Grid.h"
33 #include "Config.h"
34 #include "Exception.h"
35 #include "Utils.h"
36 
37 // STL
38 #include <iostream>
39 #include <cmath>
40 #include <stdlib.h>
41 #include <map>
42 #include <vector>
43 
44 namespace te
45 {
46  namespace cl
47  {
48  /*!
49  \class KMeans
50 
51  \brief KMeans strategy for classification.
52  Step-by-step:
53  1. Define a random solution, creating K values of mean.
54  2. Classify each input value according to the closest mean.
55  3. Calculate new means using the classified pixels.
56  4. Go back to step 2, or stop depending on the following checks:
57  4.1. Check if the new means are different from the previous iteration,
58  using the convergence threshold.
59  4.2. Check if maximum number of iterations has achieved.
60  \note Output classification IDs starting at 1.
61  */
62  template<class TTRAIN, class TCLASSIFY>
63  class KMeans
64  {
65  public:
66 
67  /*!
68  \class Parameters
69 
70  \brief Classifier Parameters
71  */
73  {
74  public:
75 
76  unsigned int m_K; //!< The number of clusters (means) to detect in image.
77  unsigned int m_maxIterations; //!< The maximum of iterations to perform if convergence is not achieved.
78  double m_epsilon; //!< The stop criteria. When the clusters change in a value smaller then epsilon, the convergence is achieved.
79 
80  Parameters();
81 
82  ~Parameters();
83 
84  //overload
85  const Parameters& operator=(const Parameters& params);
86 
87  //overload
88  void reset() throw(te::cl::Exception);
89 
90  //overload
91  AbstractParameters* clone() const;
92  };
93 
94  public:
95 
96  KMeans();
97 
98  ~KMeans();
99 
100  bool initialize(const Parameters& params) throw(te::cl::Exception);
101 
102  bool train(TTRAIN& itBegin, TTRAIN& itEnd,
103  const std::vector<unsigned int>& attributesIndices,
104  const std::vector<unsigned int>& labels,
105  const bool enableProgressInterface) throw(te::cl::Exception);
106 
107  bool classify(TCLASSIFY& itBegin, TCLASSIFY& itEnd,
108  const std::vector<unsigned int>& attributesIndices,
109  std::vector<unsigned int>& classification,
110  const bool enableProgressInterface) throw(te::cl::Exception);
111 
112  protected:
113 
114  unsigned int getClassification(std::vector<double> values);
115 
116  bool m_isInitialized; //!< True if this instance is initialized.
117  std::vector<std::vector<double> > m_KMeans; //!< The vector of K means.
118  Parameters m_parameters; //!< Internal execution parameters.
119  };
120 
121  } // end namespace cl
122 } // end namespace te
123 
124 // class of Parameters
125 template <class TTRAIN, class TCLASSIFY>
126 te::cl::KMeans<TTRAIN, TCLASSIFY>::Parameters::Parameters()
127 {
128  reset();
129 }
130 
131 template <class TTRAIN, class TCLASSIFY>
133 {
134 }
135 
136 template <class TTRAIN, class TCLASSIFY>
138 {
139  reset();
140 
141  m_K = rhs.m_K;
143  m_epsilon = rhs.m_epsilon;
144 
145  return *this;
146 }
147 
148 template <class TTRAIN, class TCLASSIFY>
150 {
151  m_K = 0;
152  m_maxIterations = 0;
153  m_epsilon = 0.0;
154 }
155 
156 template <class TTRAIN, class TCLASSIFY>
158 {
160 }
161 
162 // class KMeans Strategy
163 template <class TTRAIN, class TCLASSIFY>
165 {
166  m_isInitialized = false;
167  m_KMeans.clear();
168 }
169 
170 template <class TTRAIN, class TCLASSIFY>
172 {
173 }
174 
175 template <class TTRAIN, class TCLASSIFY>
177 {
178  m_isInitialized = false;
179 
180  m_KMeans.clear();
181  m_parameters = params;
182 
183  if (m_parameters.m_K < 2)
184  {
185  throw te::cl::Exception(TE_TR("The value of K must be at least 2."));
186  return false;
187  }
190  if (m_parameters.m_epsilon < 0)
191  m_parameters.m_epsilon = 0.0000000001;
192 
193  m_isInitialized = true;
194 
195  return true;
196 }
197 
198 template<class TTRAIN, class TCLASSIFY>
199 bool te::cl::KMeans<TTRAIN, TCLASSIFY>::train(TTRAIN& itBegin, TTRAIN& itEnd,
200  const std::vector<unsigned int>& attributesIndices,
201  const std::vector<unsigned int>& labels,
202  const bool enableProgressInterface) throw(te::cl::Exception)
203 {
204  std::vector<double> maxValues;
205  std::vector<double> minValues;
206 
207  for (unsigned int i = 0; i < attributesIndices.size(); i++)
208  {
209  maxValues.push_back(std::numeric_limits<double>::min());
210  minValues.push_back(std::numeric_limits<double>::max());
211  }
212 
213 // find maximum and minimum values for each attribute
214  std::vector<unsigned int> tmpClassification;
215  TTRAIN it = itBegin;
216  while(it != itEnd)
217  {
218  tmpClassification.push_back(0);
219  for (unsigned int i = 0; i < attributesIndices.size(); i++)
220  {
221  if ((*it)[attributesIndices[i]] > maxValues[i])
222  maxValues[i] = (*it)[attributesIndices[i]];
223  if ((*it)[attributesIndices[i]] < minValues[i])
224  minValues[i] = (*it)[attributesIndices[i]];
225  }
226  ++it;
227  }
228 
229  srand((unsigned) time(0));
230 // starting random K means
231  m_KMeans.clear();
232  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
233  {
234  std::vector<double> newMean;
235  for (unsigned int i = 0; i < attributesIndices.size(); i++)
236  {
237  newMean.push_back(
238  ( maxValues[i] == minValues[i] )
239  ?
240  ( ((double)rand()) * maxValues[i] / ((double)RAND_MAX) )
241  :
242  (
243  ( ( (double)rand() ) / ((double)RAND_MAX) )
244  *
245  ( maxValues[i] - minValues[i] )
246  )
247  );
248  }
249 
250  m_KMeans.push_back(newMean);
251  }
252 
253 // estimate K means
254  std::vector<double> values;
255  std::map<unsigned int, std::vector<double> > tmpValues;
256  std::vector<unsigned int> tmpNs;
257  unsigned int tmpClass;
258  double distanceKMeans;
259  double a_minus_b;
260  std::vector<std::vector<double> > oldKMeans;
261  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
262  {
263  tmpValues[k] = std::vector<double>(attributesIndices.size());
264  tmpNs.push_back(0);
265  }
266 
268  for (unsigned int t = 0; t < m_parameters.m_maxIterations; t++)
269  {
270  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
271  {
272  for (unsigned int i = 0; i < attributesIndices.size(); i++)
273  tmpValues[k][i] = 0.0;
274  tmpNs[k] = 0;
275  }
276 
277  // classifying input data according to actual K means
278  it = itBegin;
279  while(it != itEnd)
280  {
281  values.clear();
282  for (unsigned int i = 0; i < attributesIndices.size(); i++)
283  values.push_back((*it)[attributesIndices[i]]);
284  tmpClass = getClassification(values);
285  for (unsigned int i = 0; i < attributesIndices.size(); i++)
286  tmpValues[tmpClass][i] += values[i];
287  tmpNs[tmpClass]++;
288  ++it;
289  }
290 
291  oldKMeans = m_KMeans;
292 // recomputing K means
293  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
294  for (unsigned int i = 0; i < attributesIndices.size(); i++)
295  if (tmpNs[k] > 0)
296  m_KMeans[k][i] = (double) (tmpValues[k][i] / (double) tmpNs[k]);
297 
298 // checking convergence
299  distanceKMeans = 0.0;
300  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
301  for (unsigned int i = 0; i < attributesIndices.size(); i++)
302  {
303  a_minus_b = m_KMeans[k][i] - oldKMeans[k][i];
304  distanceKMeans += a_minus_b * a_minus_b;
305  }
306  distanceKMeans = sqrt(distanceKMeans);
307  if (distanceKMeans < m_parameters.m_epsilon)
308  break;
309 
310  task.pulse();
311  }
312 
313  return true;
314 }
315 
316 template<class TTRAIN, class TCLASSIFY>
317 bool te::cl::KMeans<TTRAIN, TCLASSIFY>::classify(TCLASSIFY& itBegin, TCLASSIFY& itEnd,
318  const std::vector<unsigned int>& attributesIndices,
319  std::vector<unsigned int>& classification,
320  const bool enableProgressInterface) throw(te::cl::Exception)
321 {
322  TCLASSIFY it = itBegin;
323  std::vector<double> values(attributesIndices.size());
324  classification.clear();
325 
326 // count number of elements to be classified
327  unsigned int N = 0;
328  while(it != itEnd)
329  {
330  N++;
331  ++it;
332  }
333  classification.resize(N);
334 
335 // classify elements
336  te::common::TaskProgress task(TE_TR("K-Means algorithm - classifying data"), te::common::TaskProgress::UNDEFINED, N);
337  unsigned j = 0;
338  it = itBegin;
339  while(it != itEnd)
340  {
341  for (unsigned int i = 0; i < attributesIndices.size(); i++)
342  values[i] = (*it)[attributesIndices[i]];
343  classification[j++] = getClassification(values);
344 
345  ++it;
346  task.pulse();
347  }
348 
349  return true;
350 }
351 
352 template<class TTRAIN, class TCLASSIFY>
353 unsigned int te::cl::KMeans<TTRAIN, TCLASSIFY>::getClassification(std::vector<double> values)
354 {
355  double minDistance = std::numeric_limits<double>::max();
356  double distance;
357  unsigned int clusterNumber = 0;
358 
359  for (unsigned int k = 1; k <= m_parameters.m_K; k++)
360  {
361  distance = GetEuclideanDistance(m_KMeans[k], values);
362  if (distance < minDistance)
363  {
364  minDistance = distance;
365  clusterNumber = k;
366  }
367  }
368 
369  return clusterNumber;
370 }
371 
372 #endif // __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
unsigned int m_maxIterations
The maximum of iterations to perform if convergence is not achieved.
Definition: KMeans.h:77
bool initialize(const Parameters &params)
Definition: KMeans.h:176
Base exception class for plugin module.
Definition: Exception.h:42
double m_epsilon
The stop criteria. When the clusters change in a value smaller then epsilon, the convergence is achie...
Definition: KMeans.h:78
void reset()
Clear all internal allocated resources and reset the parameters instance to its initial state...
Definition: KMeans.h:149
This class can be used to inform the progress of a task.
Definition: TaskProgress.h:53
const Parameters & operator=(const Parameters &params)
Definition: KMeans.h:137
Parameters m_parameters
Internal execution parameters.
Definition: KMeans.h:118
bool classify(TCLASSIFY &itBegin, TCLASSIFY &itEnd, const std::vector< unsigned int > &attributesIndices, std::vector< unsigned int > &classification, const bool enableProgressInterface)
Definition: KMeans.h:317
Classifier Parameters.
Definition: KMeans.h:72
#define TE_TR(message)
It marks a string in order to get translated.
Definition: Translator.h:264
Configuration flags for the Terrralib Classification module.
std::vector< std::vector< double > > m_KMeans
The vector of K means.
Definition: KMeans.h:117
unsigned int m_K
The number of clusters (means) to detect in image.
Definition: KMeans.h:76
TerraLib.
void pulse()
Calls setCurrentStep() function using getCurrentStep() + 1.
KMeans strategy for classification. Step-by-step:
Definition: KMeans.h:63
Utility functions for Classification.
Abstract parameters base interface.
TECLEXPORT double GetEuclideanDistance(std::vector< double > v1, std::vector< double > v2)
Computes euclidean distance between two double vectors.
AbstractParameters * clone() const
Create a clone copy of this instance.
Definition: KMeans.h:157
An exception class for the Classification module.
bool m_isInitialized
True if this instance is initialized.
Definition: KMeans.h:116
unsigned int getClassification(std::vector< double > values)
Definition: KMeans.h:353
bool train(TTRAIN &itBegin, TTRAIN &itEnd, const std::vector< unsigned int > &attributesIndices, const std::vector< unsigned int > &labels, const bool enableProgressInterface)
Definition: KMeans.h:199