SummaryFunctions.cpp
Go to the documentation of this file.
1 /* Copyright (C) 2008 National Institute For Space Research (INPE) - Brazil.
2 
3  This file is part of the TerraLib - a Framework for building GIS enabled applications.
4 
5  TerraLib is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as published by
7  the Free Software Foundation, either version 3 of the License,
8  or (at your option) any later version.
9 
10  TerraLib is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public License
16  along with TerraLib. See COPYING. If not, write to
17  TerraLib Team at <terralib-team@terralib.org>.
18  */
19 
20 /*!
21  \file SummaryFunctions.cpp
22 
23  \brief Statistical summary functions.
24  */
25 
//Terralib
#include "../../dataaccess/query_h.h"
#include "../../dataaccess/dataset/DataSet.h"
#include "../../dataaccess/dataset/DataSetType.h"
#include "../../dataaccess/datasource/DataSource.h"
#include "../../dataaccess/datasource/DataSourceCapabilities.h"
#include "../../dataaccess/datasource/DataSourceTransactor.h"
#include "../../dataaccess/utils/Utils.h"
#include "../../dataaccess/query_h.h"
#include "../../datatype/Property.h"
#include "../../maptools/AbstractLayer.h"
#include "../../maptools/DataSetLayer.h"
#include "Config.h"
#include "Exception.h"
#include "SummaryFunctions.h"
#include "Utils.h"

// BOOST
#include <boost/lexical_cast.hpp>

//STL
#include <algorithm>
#include <cassert>
#include <cmath>
#include <map>
#include <memory>
#include <numeric>
#include <vector>
52 
53 void te::stat::GetStringStatisticalSummary(std::vector<std::string>& values, te::stat::StringStatisticalSummary& ss, const std::string& nulValue)
54 {
55  if (values.empty())
56  return;
57 
58  std::vector<std::string> validValues;
59  for (size_t i=0; i<values.size(); ++i)
60  {
61  if (values[i] != nulValue)
62  validValues.push_back(values[i]);
63  }
64  GetStringStatisticalSummary(validValues, ss);
65  ss.m_count = values.size();
66  ss.m_validCount = validValues.size();
67 }
68 
70 {
71  if (values.empty())
72  return;
73 
74  std::sort(values.begin(), values.end());
75 
76  ss.m_minVal = *values.begin();
77  ss.m_maxVal = values[values.size() - 1];
78 
79  ss.m_count = values.size();
80 
81  for(std::size_t i = 0; i < values.size(); ++i)
82  {
83  if(!values[i].empty())
84  {
85  ++ss.m_validCount;
86  }
87  }
88 
89  ss.m_mode = Mode(values);
90 }
91 
92 void te::stat::GetNumericStatisticalSummary(std::vector<double>& values, te::stat::NumericStatisticalSummary& ss, double nulValue)
93 {
94  if (values.empty())
95  return;
96 
97  std::vector<double> validValues;
98  for (size_t i=0; i<values.size(); ++i)
99  {
100  if (values[i] != nulValue)
101  validValues.push_back(values[i]);
102  }
103  GetNumericStatisticalSummary(validValues, ss);
104  ss.m_count = values.size();
105  ss.m_validCount = validValues.size();
106 }
107 
109 {
110  if (values.empty())
111  return;
112 
113  std::sort(values.begin(), values.end());
114 
115  ss.m_minVal = *values.begin();
116  ss.m_maxVal = values[values.size() - 1];
117  ss.m_count = values.size();
118  ss.m_validCount = values.size();
119 
120  for(std::size_t i = 0; i < values.size(); ++i)
121  {
122  ss.m_sum += values[i];
123  }
124 
125  ss.m_mean = ss.m_sum/ss.m_count;
126 
127  for(int i = 0; i < ss.m_count; ++i)
128  {
129  double v= values[i];
130  ss.m_variance += pow((v-ss.m_mean),2);
131  ss.m_skewness += pow((v-ss.m_mean),3);
132  ss.m_kurtosis += pow((v-ss.m_mean),4);
133  }
134 
135  ss.m_variance /= ss.m_count;
136  ss.m_stdDeviation = pow(ss.m_variance, 0.5);
137  ss.m_skewness /= ss.m_count;
138  ss.m_skewness /= pow(ss.m_stdDeviation, 3);
139  ss.m_kurtosis /= ss.m_count;
140  ss.m_kurtosis /= pow(ss.m_stdDeviation, 4);
141 
142  ss.m_varCoeff = (100* ss.m_stdDeviation) / ss.m_mean;
143  ss.m_amplitude = ss.m_maxVal - ss.m_minVal;
144 
145  if((ss.m_count % 2) == 0)
146  ss.m_median = (values[(ss.m_count/2)] + values[(ss.m_count/2-1)])/2;
147  else
148  ss.m_median = values[(ss.m_count-1)/2];
149 
150  ss.m_mode = Mode(values);
151 }
152 
153 void te::stat::GetPercentOfEachClassByArea( std::vector<double>& values,
154  double& resolutionX,
155  double& resolutionY,
156  double& area,
158  bool fullIntersection)
159 {
160  if (values.empty())
161  return;
162 
163  std::map<double, double> percentMap;
164 
165  std::sort(values.begin(), values.end());
166 
167  double key = values[0];
168  int count = 1;
169 
170  if (fullIntersection)
171  {
172  for (std::size_t i = 1; i < values.size(); ++i)
173  {
174  if (values[i] == key)
175  {
176  ++count;
177  }
178  else
179  {
180  double percent = (count * 100) / (double)values.size();
181  percentMap.insert(std::pair<double, double>(key, percent));
182  key = values[i];
183  count = 1;
184  }
185  }
186 
187  double percent = (count * 100) / (double)values.size();
188  percentMap.insert(std::pair<double, double>(key, percent));
189  }
190  else
191  {
192  for (std::size_t i = 1; i < values.size(); ++i)
193  {
194  if (values[i] == key)
195  {
196  ++count;
197  }
198  else
199  {
200  double areaIntersection = count*(resolutionX*resolutionY);
201  double percentInter = (areaIntersection / area) * 100;
202  percentMap.insert(std::pair<double, double>(key, percentInter));
203  key = values[i];
204  count = 1;
205  }
206  }
207 
208  double areaIntersection = count*(resolutionX*resolutionY);
209  double percentInter = (areaIntersection / area) * 100;
210  percentMap.insert(std::pair<double, double>(key, percentInter));
211  }
212 
213  ss.m_percentEachClass = percentMap;
214 }
215 
/*!
  \brief Returns the mode(s) of a set of numbers.

  \param values The input numbers.

  \return The values that occur most often, in ascending order. Only values
          occurring more than once qualify, so the result is empty when the
          input is empty or when every value is unique. Several values are
          returned when they tie for the highest number of occurrences.
*/
std::vector<double> te::stat::Mode(const std::vector<double>& values)
{
  std::vector<double> mode;
  if (values.empty())
    return mode;

  // Count the occurrences of each distinct value; operator[] creates a
  // zero-initialized counter the first time a value is seen.
  std::map<double, int> occurrences;
  for (std::size_t i = 0; i < values.size(); ++i)
    ++occurrences[values[i]];

  int highest = 0;
  for (std::map<double, int>::const_iterator it = occurrences.begin(); it != occurrences.end(); ++it)
  {
    // A value seen only once is never a mode.
    if (it->second <= 1)
      continue;

    if (it->second > highest)
    {
      // New best: discard the candidates collected so far.
      highest = it->second;
      mode.clear();
      mode.push_back(it->first);
    }
    else if (it->second == highest)
    {
      mode.push_back(it->first);
    }
  }

  return mode;
}
276 
/*!
  \brief Returns the mode of a set of strings.

  \param values The input strings.

  \return The string that occurs most often, or an empty string when the
          input is empty. A string occurring only once can be returned. When
          several strings tie for the highest number of occurrences, the
          smallest one in lexicographic order is returned.
*/
std::string te::stat::Mode(const std::vector<std::string>& values)
{
  if (values.empty())
    return "";

  // Count the occurrences of each distinct string; operator[] creates a
  // zero-initialized counter the first time a string is seen.
  std::map<std::string, int> occurrences;
  for (std::size_t i = 0; i < values.size(); ++i)
    ++occurrences[values[i]];

  std::string mode = "";
  int highest = 0;

  // The map is visited in ascending key order and only a strictly greater
  // count replaces the current candidate, hence the tie-breaking rule above.
  for (std::map<std::string, int>::const_iterator it = occurrences.begin(); it != occurrences.end(); ++it)
  {
    if (it->second > highest)
    {
      highest = it->second;
      mode = it->first;
    }
  }

  return mode;
}
329 
330 void te::stat::GetStringStatisticalSummaryQuery(const std::string& inDataset,
331  te::da::DataSource* inDatasource,
332  const std::string& propName,
334 {
335  assert(inDatasource);
336 
337  if (!inDatasource->dataSetExists(inDataset))
338  return;
339 
340  const te::da::DataSourceCapabilities dsCapabilities = inDatasource->getCapabilities();
341 
342  if(!dsCapabilities.supportsPreparedQueryAPI())
343  {
344  std::auto_ptr<te::da::DataSet> ds = inDatasource->getDataSet(inDataset);
345 
346  std::vector<std::string> stringVector = te::stat::GetStringData(ds.get(), propName);
347 
348  te::stat::GetStringStatisticalSummary(stringVector, ss);
349  }
350  else
351  {
352  te::da::Fields* fields = new te::da::Fields;
353 
354  te::da::PropertyName* p_name = new te::da::PropertyName(propName);
355 
356  te::da::Expression* e_min = new te::da::Min(p_name);
357  te::da::Field* f_min = new te::da::Field(*e_min, p_name->getName() + "_MIN_VALUE");
358 
359  te::da::Expression* e_max = new te::da::Max(p_name);
360  te::da::Field* f_max = new te::da::Field(*e_max, p_name->getName() + "_MAX_VALUE");
361 
362  te::da::Expression* e_count = new te::da::Count(p_name);
363  te::da::Field* f_count = new te::da::Field(*e_count, p_name->getName() + "_COUNT");
364 
365  te::da::Expression* e_validcount = new te::da::Count(p_name);
366  te::da::Field* f_validcount = new te::da::Field(*e_validcount, p_name->getName() + "_VALID_COUNT");
367 
368  fields->push_back(f_min);
369  fields->push_back(f_max);
370  fields->push_back(f_count);
371  fields->push_back(f_validcount);
372 
373  te::da::FromItem* fromItem = new te::da::DataSetName(inDataset);
374  te::da::From* from = new te::da::From;
375  from->push_back(fromItem);
376 
377  te::da::Select select(fields, from);
378 
379  std::auto_ptr<te::da::DataSet> dsQuery = inDatasource->query(select);
380 
381  if (!dsQuery.get())
382  return;
383 
384  dsQuery->moveFirst();
385 
386  ss.m_minVal = dsQuery->getAsString(1);
387  ss.m_maxVal = dsQuery->getAsString(2);
388  ss.m_count = dsQuery->getInt16(3);
389  ss.m_validCount = dsQuery->getInt16(4);
390 
391  // how to get the mode?
392  }
393 }
394 
395 void te::stat::GetNumericStatisticalSummaryQuery(const std::string& inDataset,
396  te::da::DataSource* inDatasource,
397  const std::string& propName,
399 {
400  assert(inDatasource);
401 
402  if (!inDatasource->dataSetExists(inDataset))
403  return;
404 
405  const te::da::DataSourceCapabilities dsCapabilities = inDatasource->getCapabilities();
406 
407  if(!dsCapabilities.supportsPreparedQueryAPI())
408  {
409  std::auto_ptr<te::da::DataSet> ds = inDatasource->getDataSet(inDataset);
410 
411  std::vector<double> numericVector = te::stat::GetNumericData(ds.get(), propName);
412 
413  te::stat::GetNumericStatisticalSummary(numericVector, ss);
414  }
415  else
416  {
417  te::da::PropertyName* p_name = new te::da::PropertyName(propName);
418 
419  te::da::Expression* e_min = new te::da::Min(p_name);
420  te::da::Field* f_min = new te::da::Field(*e_min, "MIN");
421 
422  te::da::Expression* e_max = new te::da::Max(p_name);
423  te::da::Field* f_max = new te::da::Field(*e_max, "MAX");
424 
425  te::da::Expression* e_count = new te::da::Count(p_name);
426  te::da::Field* f_count = new te::da::Field(*e_count, "COUNT");
427 
428  te::da::Expression* e_sum = new te::da::Sum(p_name);
429  te::da::Field* f_sum = new te::da::Field(*e_sum, "SUM");
430 
431  te::da::Expression* e_mean = new te::da::Avg(p_name);
432  te::da::Field* f_mean = new te::da::Field(*e_mean, "MEAN");
433 
434  te::da::Expression* e_stddev = new te::da::StdDev(p_name);
435  te::da::Field* f_stddev = new te::da::Field(*e_stddev, "STD_DEV");
436 
437  te::da::Expression* e_variance = new te::da::Variance(p_name);
438  te::da::Field* f_variance = new te::da::Field(*e_variance, "VARIANCE");
439 
440  te::da::Expression* e_amplitude = new te::da::Sub(*e_max, *e_min);
441  te::da::Field* f_amplitude = new te::da::Field(*e_amplitude, "AMPLITUDE");
442 
443  te::da::Fields* fields = new te::da::Fields;
444 
445  fields->push_back(f_min);
446  fields->push_back(f_max);
447  fields->push_back(f_count);
448  fields->push_back(f_sum);
449  fields->push_back(f_mean);
450  fields->push_back(f_stddev);
451  fields->push_back(f_variance);
452  fields->push_back(f_amplitude);
453 
454  te::da::FromItem* fromItem = new te::da::DataSetName(inDataset);
455  te::da::From* from = new te::da::From;
456  from->push_back(fromItem);
457 
458  te::da::Select select(fields, from);
459 
460  std::auto_ptr<te::da::DataSet> dsQuery = inDatasource->query(select);
461 
462  if (!dsQuery.get())
463  return;
464 
465  dsQuery->moveFirst();
466 
467  ss.m_minVal = boost::lexical_cast<double>(dsQuery->getAsString(0));
468  ss.m_maxVal = boost::lexical_cast<double>(dsQuery->getAsString(1));
469  ss.m_count = boost::lexical_cast<int>(dsQuery->getAsString(2));
470  ss.m_sum = boost::lexical_cast<double>(dsQuery->getAsString(3));
471  ss.m_mean = boost::lexical_cast<double>(dsQuery->getAsString(4));
472  ss.m_stdDeviation = boost::lexical_cast<double>(dsQuery->getAsString(5));
473  ss.m_variance = boost::lexical_cast<double>(dsQuery->getAsString(6));
474  ss.m_amplitude = boost::lexical_cast<double>(dsQuery->getAsString(7));
475  }
476 }
TESTATEXPORT void GetNumericStatisticalSummary(std::vector< double > &values, te::stat::NumericStatisticalSummary &ss, double nullVal)
const std::string & getName() const
It returns the property name.
Definition: PropertyName.h:80
TESTATEXPORT void GetNumericStatisticalSummaryQuery(const std::string &inDataset, te::da::DataSource *inDatasource, const std::string &propName, te::stat::NumericStatisticalSummary &ss)
A structure to hold the set of statistics from a set of numerical values.
Utility functions for the data access module.
An abstract class that models a source of data in a query.
Definition: FromItem.h:50
The Field class can be used to model an expression that takes part of the output items of a SELECT...
Definition: Field.h:50
A class that models the name of a dataset used in a From clause.
Definition: DataSetName.h:43
TESTATEXPORT void GetStringStatisticalSummaryQuery(const std::string &inDataset, te::da::DataSource *inDatasource, const std::string &propName, te::stat::StringStatisticalSummary &ss)
TESTATEXPORT std::vector< double > Mode(const std::vector< double > &values)
virtual std::auto_ptr< DataSet > query(const Select &q, te::common::TraverseType travType=te::common::FORWARDONLY, const te::common::AccessPolicy accessPolicy=te::common::RAccess)
It executes a query that may return some data using a generic query. This method always returns a dis...
Definition: DataSource.cpp:100
A class that models the name of any property of an object.
Definition: PropertyName.h:50
Count statistical function.
Definition: Count.h:46
A class that represents the known capabilities of a specific data source, i.e. this class informs all...
An abstract class for data providers like a DBMS, Web Services or a regular file. ...
Definition: DataSource.h:118
Avg statistical function.
Definition: Avg.h:46
Min statistical function.
Definition: Min.h:46
std::map< double, double > m_percentEachClass
This is an abstract class that models a query expression.
Definition: Expression.h:47
StdDev statistical function.
Definition: StdDev.h:46
TESTATEXPORT std::vector< double > GetNumericData(te::da::DataSet *dataSet, const std::string propName)
Returns the values of a numeric type property in a vector of values.
Definition: Utils.cpp:170
A set of functions to calculate the statistic summary from a set of values.
TESAEXPORT double Sum(te::sa::GeneralizedProximityMatrix *gpm, int attrIdx)
Function used to calculate sum of a specific attribute from a gpm.
TESTATEXPORT std::vector< std::string > GetStringData(te::da::DataSet *dataSet, const std::string propName)
Returns the values of a string type property in a vector of values.
Definition: Utils.cpp:151
virtual std::auto_ptr< DataSet > getDataSet(const std::string &name, te::common::TraverseType travType=te::common::FORWARDONLY, const te::common::AccessPolicy accessPolicy=te::common::RAccess)
It gets the dataset identified by the given name. This method always returns a disconnected dataset...
Definition: DataSource.cpp:61
boost::ptr_vector< Field > Fields
Fields is just a boost::ptr_vector of Field pointers.
Definition: Fields.h:37
virtual const DataSourceCapabilities & getCapabilities() const =0
It returns the known capabilities of the data source.
TESTATEXPORT void GetStringStatisticalSummary(std::vector< std::string > &values, te::stat::StringStatisticalSummary &ss)
A Select models a query to be used when retrieving data from a DataSource.
Definition: Select.h:65
boost::ptr_vector< FromItem > From
It models the FROM clause for a query.
Definition: From.h:37
virtual bool dataSetExists(const std::string &name)
It checks if a dataset with the given name exists in the data source.
Definition: DataSource.cpp:439
The subtraction operator.
Definition: Sub.h:46
A structure to hold the set of statistics from a set of categorical (sample) values.
Variance statistical function.
Definition: Variance.h:46
An exception class for the statistical module.
Max statistical function.
Definition: Max.h:46
TESTATEXPORT void GetPercentOfEachClassByArea(std::vector< double > &values, double &resolutionX, double &resolutionY, double &area, te::stat::NumericStatisticalSummary &ss, bool fullIntersection=true)
Configuration flags for the TerraLib Statistic module.