KMeans.h
Go to the documentation of this file.
1 /* Copyright (C) 2008 National Institute For Space Research (INPE) - Brazil.
2 
3  This file is part of the TerraLib - a Framework for building GIS enabled applications.
4 
5  TerraLib is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as published by
7  the Free Software Foundation, either version 3 of the License,
8  or (at your option) any later version.
9 
10  TerraLib is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public License
16  along with TerraLib. See COPYING. If not, write to
17  TerraLib Team at <terralib-team@terralib.org>.
18  */
19 
20 /*!
21  \file terralib/classification/KMeans.h
22 
23  \brief KMeans strategy for classification.
24 */
25 
26 #ifndef __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
27 #define __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
28 
29 // TerraLib
30 #include "../common/AbstractParameters.h"
31 #include "../common/progress/TaskProgress.h"
32 #include "../raster/Grid.h"
33 #include "Config.h"
34 #include "Exception.h"
35 #include "Utils.h"
36 
37 // STL
38 #include <iostream>
39 #include <math.h>
40 #include <stdlib.h>
41 #include <map>
42 #include <vector>
43 
44 namespace te
45 {
46  namespace cl
47  {
48  /*!
49  \class KMeans
50 
51  \brief KMeans strategy for classification.
52  Step-by-step:
53  1. Define a random solution, creating K values of mean.
54  2. Classify each input value according to the closest mean.
55  3. Calculate new means using the classified pixels.
56  4. Go back to step 2, or stop depending on the following checks:
57  4.1. Check if the new means are different from the previous iteration,
58  using the convergence threshold.
59  4.2. Check if maximum number of iterations has achieved.
60  */
61  template<class TTRAIN, class TCLASSIFY>
62  class KMeans
63  {
64  public:
65 
66  /*!
67  \class Parameters
68 
69  \brief Classifier Parameters
70  */
72  {
73  public:
74 
75  unsigned int m_K; //!< The number of clusters (means) to detect in image.
76  unsigned int m_maxIterations; //!< The maximum of iterations to perform if convergence is not achieved.
77  double m_epsilon; //!< The stop criteria. When the clusters change in a value smaller then epsilon, the convergence is achieved.
78 
79  Parameters();
80 
81  ~Parameters();
82 
83  //overload
84  const Parameters& operator=(const Parameters& params);
85 
86  //overload
87  void reset() throw(te::cl::Exception);
88 
89  //overload
90  AbstractParameters* clone() const;
91  };
92 
93  public:
94 
95  KMeans();
96 
97  ~KMeans();
98 
99  bool initialize(const Parameters& params) throw(te::cl::Exception);
100 
101  bool train(TTRAIN& itBegin, TTRAIN& itEnd,
102  const std::vector<unsigned int>& attributesIndices,
103  const std::vector<unsigned int>& labels,
104  const bool enableProgressInterface) throw(te::cl::Exception);
105 
106  bool classify(TCLASSIFY& itBegin, TCLASSIFY& itEnd,
107  const std::vector<unsigned int>& attributesIndices,
108  std::vector<unsigned int>& classification,
109  const bool enableProgressInterface) throw(te::cl::Exception);
110 
111  protected:
112 
113  unsigned int getClassification(std::vector<double> values);
114 
115  bool m_isInitialized; //!< True if this instance is initialized.
116  std::vector<std::vector<double> > m_KMeans; //!< The vector of K means.
117  Parameters m_parameters; //!< Internal execution parameters.
118  };
119 
120  } // end namespace cl
121 } // end namespace te
122 
123 // class of Parameters
124 template <class TTRAIN, class TCLASSIFY>
125 te::cl::KMeans<TTRAIN, TCLASSIFY>::Parameters::Parameters()
126 {
127  reset();
128 }
129 
130 template <class TTRAIN, class TCLASSIFY>
132 {
133 }
134 
135 template <class TTRAIN, class TCLASSIFY>
137 {
138  reset();
139 
140  m_K = rhs.m_K;
141  m_maxIterations = rhs.m_maxIterations;
142  m_epsilon = rhs.m_epsilon;
143 
144  return *this;
145 }
146 
147 template <class TTRAIN, class TCLASSIFY>
149 {
150  m_K = 0;
151  m_maxIterations = 0;
152  m_epsilon = 0.0;
153 }
154 
155 template <class TTRAIN, class TCLASSIFY>
157 {
159 }
160 
161 // class KMeans Strategy
162 template <class TTRAIN, class TCLASSIFY>
164 {
165  m_isInitialized = false;
166  m_KMeans.clear();
167 }
168 
169 template <class TTRAIN, class TCLASSIFY>
171 {
172 }
173 
174 template <class TTRAIN, class TCLASSIFY>
176 {
177  m_isInitialized = false;
178 
179  m_KMeans.clear();
180  m_parameters = params;
181 
182  if (m_parameters.m_K < 2)
183  {
184  throw te::cl::Exception(TE_TR("The value of K must be at least 2."));
185  return false;
186  }
189  if (m_parameters.m_epsilon < 0)
190  m_parameters.m_epsilon = 0.0000000001;
191 
192  m_isInitialized = true;
193 
194  return true;
195 }
196 
197 template<class TTRAIN, class TCLASSIFY>
198 bool te::cl::KMeans<TTRAIN, TCLASSIFY>::train(TTRAIN& itBegin, TTRAIN& itEnd,
199  const std::vector<unsigned int>& attributesIndices,
200  const std::vector<unsigned int>& labels,
201  const bool enableProgressInterface) throw(te::cl::Exception)
202 {
203  std::vector<double> maxValues;
204  std::vector<double> minValues;
205 
206  for (unsigned int i = 0; i < attributesIndices.size(); i++)
207  {
208  maxValues.push_back(std::numeric_limits<double>::min());
209  minValues.push_back(std::numeric_limits<double>::max());
210  }
211 
212 // find maximum and minimum values for each attribute
213  std::vector<unsigned int> tmpClassification;
214  TTRAIN it = itBegin;
215  while(it != itEnd)
216  {
217  tmpClassification.push_back(0);
218  for (unsigned int i = 0; i < attributesIndices.size(); i++)
219  {
220  if ((*it)[attributesIndices[i]] > maxValues[i])
221  maxValues[i] = (*it)[attributesIndices[i]];
222  if ((*it)[attributesIndices[i]] < minValues[i])
223  minValues[i] = (*it)[attributesIndices[i]];
224  }
225  ++it;
226  }
227 
228  srand((unsigned) time(0));
229 // starting random K means
230  m_KMeans.clear();
231  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
232  {
233  std::vector<double> newMean;
234  for (unsigned int i = 0; i < attributesIndices.size(); i++)
235  newMean.push_back(rand() % (int) (maxValues[i] - minValues[i]));
236 
237  m_KMeans.push_back(newMean);
238  }
239 
240 // estimate K means
241  std::vector<double> values;
242  std::map<unsigned int, std::vector<double> > tmpValues;
243  std::vector<unsigned int> tmpNs;
244  unsigned int tmpClass;
245  double distanceKMeans;
246  double a_minus_b;
247  std::vector<std::vector<double> > oldKMeans;
248  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
249  {
250  tmpValues[k] = std::vector<double>(attributesIndices.size());
251  tmpNs.push_back(0);
252  }
253 
255  for (unsigned int t = 0; t < m_parameters.m_maxIterations; t++)
256  {
257  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
258  {
259  for (unsigned int i = 0; i < attributesIndices.size(); i++)
260  tmpValues[k][i] = 0.0;
261  tmpNs[k] = 0;
262  }
263 
264  // classifying input data according to actual K means
265  it = itBegin;
266  while(it != itEnd)
267  {
268  values.clear();
269  for (unsigned int i = 0; i < attributesIndices.size(); i++)
270  values.push_back((*it)[attributesIndices[i]]);
271  tmpClass = getClassification(values);
272  for (unsigned int i = 0; i < attributesIndices.size(); i++)
273  tmpValues[tmpClass][i] += values[i];
274  tmpNs[tmpClass]++;
275  ++it;
276  }
277 
278  oldKMeans = m_KMeans;
279 // recomputing K means
280  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
281  for (unsigned int i = 0; i < attributesIndices.size(); i++)
282  if (tmpNs[k] > 0)
283  m_KMeans[k][i] = (double) (tmpValues[k][i] / (double) tmpNs[k]);
284 
285 // checking convergence
286  distanceKMeans = 0.0;
287  for (unsigned int k = 0; k <= m_parameters.m_K; k++)
288  for (unsigned int i = 0; i < attributesIndices.size(); i++)
289  {
290  a_minus_b = m_KMeans[k][i] - oldKMeans[k][i];
291  distanceKMeans += a_minus_b * a_minus_b;
292  }
293  distanceKMeans = sqrt(distanceKMeans);
294  if (distanceKMeans < m_parameters.m_epsilon)
295  break;
296 
297  task.pulse();
298  }
299 
300  return true;
301 }
302 
303 template<class TTRAIN, class TCLASSIFY>
304 bool te::cl::KMeans<TTRAIN, TCLASSIFY>::classify(TCLASSIFY& itBegin, TCLASSIFY& itEnd,
305  const std::vector<unsigned int>& attributesIndices,
306  std::vector<unsigned int>& classification,
307  const bool enableProgressInterface) throw(te::cl::Exception)
308 {
309  TCLASSIFY it = itBegin;
310  std::vector<double> values(attributesIndices.size());
311  classification.clear();
312 
313 // count number of elements to be classified
314  unsigned int N = 0;
315  while(it != itEnd)
316  {
317  N++;
318  ++it;
319  }
320  classification.resize(N);
321 
322 // classify elements
323  te::common::TaskProgress task(TE_TR("K-Means algorithm - classifying data"), te::common::TaskProgress::UNDEFINED, N);
324  unsigned j = 0;
325  it = itBegin;
326  while(it != itEnd)
327  {
328  for (unsigned int i = 0; i < attributesIndices.size(); i++)
329  values[i] = (*it)[attributesIndices[i]];
330  classification[j++] = getClassification(values);
331 
332  ++it;
333  task.pulse();
334  }
335 
336  return true;
337 }
338 
339 template<class TTRAIN, class TCLASSIFY>
340 unsigned int te::cl::KMeans<TTRAIN, TCLASSIFY>::getClassification(std::vector<double> values)
341 {
342  double minDistance = std::numeric_limits<double>::max();
343  double distance;
344  unsigned int clusterNumber = 0;
345 
346  for (unsigned int k = 1; k <= m_parameters.m_K; k++)
347  {
348  distance = GetEuclideanDistance(m_KMeans[k], values);
349  if (distance < minDistance)
350  {
351  minDistance = distance;
352  clusterNumber = k;
353  }
354  }
355 
356  return clusterNumber;
357 }
358 
359 #endif // __TERRALIB_CLASSIFICATION_INTERNAL_KMEANS_H
unsigned int m_maxIterations
The maximum of iterations to perform if convergence is not achieved.
Definition: KMeans.h:76
bool initialize(const Parameters &params)
Definition: KMeans.h:175
Base exception class for plugin module.
Definition: Exception.h:42
double m_epsilon
The stop criteria. When the clusters change in a value smaller then epsilon, the convergence is achie...
Definition: KMeans.h:77
void reset()
Clear all internal allocated resources and reset the parameters instance to its initial state...
Definition: KMeans.h:148
This class can be used to inform the progress of a task.
Definition: TaskProgress.h:53
const Parameters & operator=(const Parameters &params)
Definition: KMeans.h:136
Parameters m_parameters
Internal execution parameters.
Definition: KMeans.h:117
bool classify(TCLASSIFY &itBegin, TCLASSIFY &itEnd, const std::vector< unsigned int > &attributesIndices, std::vector< unsigned int > &classification, const bool enableProgressInterface)
Definition: KMeans.h:304
Classifier Parameters.
Definition: KMeans.h:71
#define TE_TR(message)
It marks a string in order to get translated.
Definition: Translator.h:243
Configuration flags for the Terrralib Classification module.
std::vector< std::vector< double > > m_KMeans
The vector of K means.
Definition: KMeans.h:116
unsigned int m_K
The number of clusters (means) to detect in image.
Definition: KMeans.h:75
URI C++ Library.
void pulse()
Calls setCurrentStep() function using getCurrentStep() + 1.
KMeans strategy for classification. Step-by-step:
Definition: KMeans.h:62
Utility functions for Classification.
Abstract parameters base interface.
TECLEXPORT double GetEuclideanDistance(std::vector< double > v1, std::vector< double > v2)
Computes euclidean distance between two double vectors.
An exception class for the Classification module.
bool m_isInitialized
True if this instance is initialized.
Definition: KMeans.h:115
AbstractParameters * clone() const
Create a clone copy of this instance.
Definition: KMeans.h:156
unsigned int getClassification(std::vector< double > values)
Definition: KMeans.h:340
bool train(TTRAIN &itBegin, TTRAIN &itEnd, const std::vector< unsigned int > &attributesIndices, const std::vector< unsigned int > &labels, const bool enableProgressInterface)
Definition: KMeans.h:198