KMeans.h
Go to the documentation of this file.
1 /* Copyright (C) 2008 National Institute For Space Research (INPE) - Brazil.
2 
3  This file is part of the TerraLib - a Framework for building GIS enabled applications.
4 
5  TerraLib is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as published by
7  the Free Software Foundation, either version 3 of the License,
8  or (at your option) any later version.
9 
10  TerraLib is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public License
16  along with TerraLib. See COPYING. If not, write to
17  TerraLib Team at <terralib-team@terralib.org>.
18  */
19 
20 /*!
21  \file terralib/classification/KMeans.h
22 
23  \brief KMeans strategy for classification.
24 */
25 
26 #ifndef __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
27 #define __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
28 
29 // TerraLib
30 #include "../common/AbstractParameters.h"
31 #include "../common/progress/TaskProgress.h"
32 #include "../raster/Grid.h"
33 #include "Config.h"
34 #include "Exception.h"
35 #include "Utils.h"
36 
37 // STL
38 #include <iostream>
39 #include <cmath>
40 #include <stdlib.h>
41 #include <map>
42 #include <vector>
43 
44 namespace te
45 {
46  namespace cl
47  {
48  /*!
49  \class KMeans
50 
51  \brief KMeans strategy for classification.
52  Step-by-step:
53  1. Define a random solution, creating K values of mean.
54  2. Classify each input value according to the closest mean.
55  3. Calculate new means using the classified pixels.
56  4. Go back to step 2, or stop depending on the following checks:
57  4.1. Check if the new means are different from the previous iteration,
58  using the convergence threshold.
59  4.2. Check if maximum number of iterations has achieved.
60  */
61  template<class TTRAIN, class TCLASSIFY>
62  class KMeans
63  {
64  public:
65 
66  /*!
67  \class Parameters
68 
69  \brief Classifier Parameters
70  */
72  {
73  public:
74 
75  unsigned int m_K; //!< The number of clusters (means) to detect in image.
76  unsigned int m_maxIterations; //!< The maximum of iterations to perform if convergence is not achieved.
77  double m_epsilon; //!< The stop criteria. When the clusters change in a value smaller then epsilon, the convergence is achieved.
78 
79  Parameters();
80 
81  ~Parameters();
82 
83  //overload
84  const Parameters& operator=(const Parameters& params);
85 
86  //overload
87  void reset() throw(te::cl::Exception);
88 
89  //overload
90  AbstractParameters* clone() const;
91  };
92 
93  public:
94 
95  KMeans();
96 
97  ~KMeans();
98 
99  bool initialize(const Parameters& params) throw(te::cl::Exception);
100 
101  bool train(TTRAIN& itBegin, TTRAIN& itEnd,
102  const std::vector<unsigned int>& attributesIndices,
103  const std::vector<unsigned int>& labels,
104  const bool enableProgressInterface) throw(te::cl::Exception);
105 
106  bool classify(TCLASSIFY& itBegin, TCLASSIFY& itEnd,
107  const std::vector<unsigned int>& attributesIndices,
108  std::vector<unsigned int>& classification,
109  const bool enableProgressInterface) throw(te::cl::Exception);
110 
111  protected:
112 
113  unsigned int getClassification(std::vector<double> values);
114 
115  bool m_isInitialized; //!< True if this instance is initialized.
116  std::vector<std::vector<double> > m_KMeans; //!< The vector of K means.
117  Parameters m_parameters; //!< Internal execution parameters.
118  };
119 
120  } // end namespace cl
121 } // end namespace te
122 
123 // class of Parameters
124 template <class TTRAIN, class TCLASSIFY>
125 te::cl::KMeans<TTRAIN, TCLASSIFY>::Parameters::Parameters()
126 {
127  reset();
128 }
129 
130 template <class TTRAIN, class TCLASSIFY>
132 {
133 }
134 
135 template <class TTRAIN, class TCLASSIFY>
137 {
138  reset();
139 
140  m_K = rhs.m_K;
142  m_epsilon = rhs.m_epsilon;
143 
144  return *this;
145 }
146 
147 template <class TTRAIN, class TCLASSIFY>
149 {
150  m_K = 0;
151  m_maxIterations = 0;
152  m_epsilon = 0.0;
153 }
154 
155 template <class TTRAIN, class TCLASSIFY>
157 {
159 }
160 
161 // class KMeans Strategy
162 template <class TTRAIN, class TCLASSIFY>
164 {
165  m_isInitialized = false;
166  m_KMeans.clear();
167 }
168 
169 template <class TTRAIN, class TCLASSIFY>
171 {
172 }
173 
174 template <class TTRAIN, class TCLASSIFY>
176 {
177  m_isInitialized = false;
178 
179  m_KMeans.clear();
180  m_parameters = params;
181 
182  if (m_parameters.m_K < 2)
183  {
184  throw te::cl::Exception(TE_TR("The value of K must be at least 2."));
185  return false;
186  }
189  if (m_parameters.m_epsilon < 0)
190  m_parameters.m_epsilon = 0.0000000001;
191 
192  m_isInitialized = true;
193 
194  return true;
195 }
196 
197 template<class TTRAIN, class TCLASSIFY>
198 bool te::cl::KMeans<TTRAIN, TCLASSIFY>::train(TTRAIN& itBegin, TTRAIN& itEnd,
199  const std::vector<unsigned int>& attributesIndices,
200  const std::vector<unsigned int>& labels,
201  const bool enableProgressInterface) throw(te::cl::Exception)
202 {
203  std::vector<double> maxValues;
204  std::vector<double> minValues;
205 
206  for (unsigned int i = 0; i < attributesIndices.size(); i++)
207  {
208  maxValues.push_back(std::numeric_limits<double>::min());
209  minValues.push_back(std::numeric_limits<double>::max());
210  }
211 
212 // find maximum and minimum values for each attribute
213  std::vector<unsigned int> tmpClassification;
214  TTRAIN it = itBegin;
215  while(it != itEnd)
216  {
217  tmpClassification.push_back(0);
218  for (unsigned int i = 0; i < attributesIndices.size(); i++)
219  {
220  if ((*it)[attributesIndices[i]] > maxValues[i])
221  maxValues[i] = (*it)[attributesIndices[i]];
222  if ((*it)[attributesIndices[i]] < minValues[i])
223  minValues[i] = (*it)[attributesIndices[i]];
224  }
225  ++it;
226  }
227 
228  srand((unsigned) time(0));
229 // starting random K means
230  m_KMeans.clear();
231  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
232  {
233  std::vector<double> newMean;
234  for (unsigned int i = 0; i < attributesIndices.size(); i++)
235  {
236  newMean.push_back(
237  ( maxValues[i] == minValues[i] )
238  ?
239  ( ((double)rand()) * maxValues[i] / ((double)RAND_MAX) )
240  :
241  (
242  ( ( (double)rand() ) / ((double)RAND_MAX) )
243  *
244  ( maxValues[i] - minValues[i] )
245  )
246  );
247  }
248 
249  m_KMeans.push_back(newMean);
250  }
251 
252 // estimate K means
253  std::vector<double> values;
254  std::map<unsigned int, std::vector<double> > tmpValues;
255  std::vector<unsigned int> tmpNs;
256  unsigned int tmpClass;
257  double distanceKMeans;
258  double a_minus_b;
259  std::vector<std::vector<double> > oldKMeans;
260  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
261  {
262  tmpValues[k] = std::vector<double>(attributesIndices.size());
263  tmpNs.push_back(0);
264  }
265 
267  for (unsigned int t = 0; t < m_parameters.m_maxIterations; t++)
268  {
269  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
270  {
271  for (unsigned int i = 0; i < attributesIndices.size(); i++)
272  tmpValues[k][i] = 0.0;
273  tmpNs[k] = 0;
274  }
275 
276  // classifying input data according to actual K means
277  it = itBegin;
278  while(it != itEnd)
279  {
280  values.clear();
281  for (unsigned int i = 0; i < attributesIndices.size(); i++)
282  values.push_back((*it)[attributesIndices[i]]);
283  tmpClass = getClassification(values);
284  for (unsigned int i = 0; i < attributesIndices.size(); i++)
285  tmpValues[tmpClass][i] += values[i];
286  tmpNs[tmpClass]++;
287  ++it;
288  }
289 
290  oldKMeans = m_KMeans;
291 // recomputing K means
292  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
293  for (unsigned int i = 0; i < attributesIndices.size(); i++)
294  if (tmpNs[k] > 0)
295  m_KMeans[k][i] = (double) (tmpValues[k][i] / (double) tmpNs[k]);
296 
297 // checking convergence
298  distanceKMeans = 0.0;
299  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
300  for (unsigned int i = 0; i < attributesIndices.size(); i++)
301  {
302  a_minus_b = m_KMeans[k][i] - oldKMeans[k][i];
303  distanceKMeans += a_minus_b * a_minus_b;
304  }
305  distanceKMeans = sqrt(distanceKMeans);
306  if (distanceKMeans < m_parameters.m_epsilon)
307  break;
308 
309  task.pulse();
310  }
311 
312  return true;
313 }
314 
315 template<class TTRAIN, class TCLASSIFY>
316 bool te::cl::KMeans<TTRAIN, TCLASSIFY>::classify(TCLASSIFY& itBegin, TCLASSIFY& itEnd,
317  const std::vector<unsigned int>& attributesIndices,
318  std::vector<unsigned int>& classification,
319  const bool enableProgressInterface) throw(te::cl::Exception)
320 {
321  TCLASSIFY it = itBegin;
322  std::vector<double> values(attributesIndices.size());
323  classification.clear();
324 
325 // count number of elements to be classified
326  unsigned int N = 0;
327  while(it != itEnd)
328  {
329  N++;
330  ++it;
331  }
332  classification.resize(N);
333 
334 // classify elements
335  te::common::TaskProgress task(TE_TR("K-Means algorithm - classifying data"), te::common::TaskProgress::UNDEFINED, N);
336  unsigned j = 0;
337  it = itBegin;
338  while(it != itEnd)
339  {
340  for (unsigned int i = 0; i < attributesIndices.size(); i++)
341  values[i] = (*it)[attributesIndices[i]];
342  classification[j++] = getClassification(values);
343 
344  ++it;
345  task.pulse();
346  }
347 
348  return true;
349 }
350 
351 template<class TTRAIN, class TCLASSIFY>
352 unsigned int te::cl::KMeans<TTRAIN, TCLASSIFY>::getClassification(std::vector<double> values)
353 {
354  double minDistance = std::numeric_limits<double>::max();
355  double distance;
356  unsigned int clusterNumber = 0;
357 
358  for (unsigned int k = 1; k <= m_parameters.m_K; k++)
359  {
360  distance = GetEuclideanDistance(m_KMeans[k], values);
361  if (distance < minDistance)
362  {
363  minDistance = distance;
364  clusterNumber = k;
365  }
366  }
367 
368  return clusterNumber;
369 }
370 
371 #endif // __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
unsigned int m_maxIterations
The maximum of iterations to perform if convergence is not achieved.
Definition: KMeans.h:76
bool initialize(const Parameters &params)
Definition: KMeans.h:175
Base exception class for plugin module.
Definition: Exception.h:42
double m_epsilon
The stop criteria. When the clusters change in a value smaller then epsilon, the convergence is achie...
Definition: KMeans.h:77
void reset()
Clear all internal allocated resources and reset the parameters instance to its initial state...
Definition: KMeans.h:148
This class can be used to inform the progress of a task.
Definition: TaskProgress.h:53
const Parameters & operator=(const Parameters &params)
Definition: KMeans.h:136
Parameters m_parameters
Internal execution parameters.
Definition: KMeans.h:117
bool classify(TCLASSIFY &itBegin, TCLASSIFY &itEnd, const std::vector< unsigned int > &attributesIndices, std::vector< unsigned int > &classification, const bool enableProgressInterface)
Definition: KMeans.h:316
Classifier Parameters.
Definition: KMeans.h:71
#define TE_TR(message)
It marks a string in order to get translated.
Definition: Translator.h:242
Configuration flags for the Terrralib Classification module.
std::vector< std::vector< double > > m_KMeans
The vector of K means.
Definition: KMeans.h:116
unsigned int m_K
The number of clusters (means) to detect in image.
Definition: KMeans.h:75
TerraLib.
void pulse()
Calls setCurrentStep() function using getCurrentStep() + 1.
KMeans strategy for classification. Step-by-step:
Definition: KMeans.h:62
Utility functions for Classification.
Abstract parameters base interface.
TECLEXPORT double GetEuclideanDistance(std::vector< double > v1, std::vector< double > v2)
Computes euclidean distance between two double vectors.
AbstractParameters * clone() const
Create a clone copy of this instance.
Definition: KMeans.h:156
An exception class for the Classification module.
bool m_isInitialized
True if this instance is initialized.
Definition: KMeans.h:115
unsigned int getClassification(std::vector< double > values)
Definition: KMeans.h:352
bool train(TTRAIN &itBegin, TTRAIN &itEnd, const std::vector< unsigned int > &attributesIndices, const std::vector< unsigned int > &labels, const bool enableProgressInterface)
Definition: KMeans.h:198