All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
SummaryFunctions.cpp
Go to the documentation of this file.
1 /* Copyright (C) 2008 National Institute For Space Research (INPE) - Brazil.
2 
3  This file is part of the TerraLib - a Framework for building GIS enabled applications.
4 
5  TerraLib is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as published by
7  the Free Software Foundation, either version 3 of the License,
8  or (at your option) any later version.
9 
10  TerraLib is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public License
16  along with TerraLib. See COPYING. If not, write to
17  TerraLib Team at <terralib-team@terralib.org>.
18  */
19 
20 /*!
21  \file SummaryFunctions.cpp
22 
23  \brief Statistical summary functions.
24  */
25 
26 //Terralib
27 #include "../../dataaccess/query_h.h"
28 #include "../../dataaccess/dataset/DataSet.h"
29 #include "../../dataaccess/dataset/DataSetType.h"
30 #include "../../dataaccess/datasource/DataSource.h"
31 #include "../../dataaccess/datasource/DataSourceCapabilities.h"
32 #include "../../dataaccess/datasource/DataSourceTransactor.h"
33 #include "../../dataaccess/utils/Utils.h"
34 #include "../../dataaccess/query_h.h"
35 #include "../../datatype/Property.h"
36 #include "../../maptools/AbstractLayer.h"
37 #include "../../maptools/DataSetLayer.h"
38 #include "Config.h"
39 #include "Exception.h"
40 #include "SummaryFunctions.h"
41 #include "Utils.h"
42 
43 // BOOST
44 #include <boost/lexical_cast.hpp>
45 
46 //STL
#include <algorithm>
#include <cassert>
#include <cmath>
#include <map>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
52 
53 void te::stat::GetStringStatisticalSummary(std::vector<std::string>& values, te::stat::StringStatisticalSummary& ss, const std::string& nulValue)
54 {
55  if (values.empty())
56  return;
57 
58  std::vector<std::string> validValues;
59  for (size_t i=0; i<values.size(); ++i)
60  {
61  if (values[i] != nulValue)
62  validValues.push_back(values[i]);
63  }
64  GetStringStatisticalSummary(validValues, ss);
65  ss.m_count = values.size();
66  ss.m_validCount = validValues.size();
67 }
68 
70 {
71  if (values.empty())
72  return;
73 
74  std::sort(values.begin(), values.end());
75 
76  ss.m_minVal = *values.begin();
77  ss.m_maxVal = values[values.size() - 1];
78 
79  ss.m_count = values.size();
80 
81  for(std::size_t i = 0; i < values.size(); ++i)
82  {
83  if(!values[i].empty())
84  {
85  ++ss.m_validCount;
86  }
87  }
88 
89  ss.m_mode = Mode(values);
90 }
91 
92 void te::stat::GetNumericStatisticalSummary(std::vector<double>& values, te::stat::NumericStatisticalSummary& ss, double nulValue)
93 {
94  if (values.empty())
95  return;
96 
97  std::vector<double> validValues;
98  for (size_t i=0; i<values.size(); ++i)
99  {
100  if (values[i] != nulValue)
101  validValues.push_back(values[i]);
102  }
103  GetNumericStatisticalSummary(validValues, ss);
104  ss.m_count = values.size();
105  ss.m_validCount = validValues.size();
106 }
107 
109 {
110  if (values.empty())
111  return;
112 
113  std::sort(values.begin(), values.end());
114 
115  ss.m_minVal = *values.begin();
116  ss.m_maxVal = values[values.size() - 1];
117  ss.m_count = values.size();
118  ss.m_validCount = values.size();
119 
120  for(std::size_t i = 0; i < values.size(); ++i)
121  {
122  ss.m_sum += values[i];
123  }
124 
125  ss.m_mean = ss.m_sum/ss.m_count;
126 
127  for(int i = 0; i < ss.m_count; ++i)
128  {
129  double v= values[i];
130  ss.m_variance += pow((v-ss.m_mean),2);
131  ss.m_skewness += pow((v-ss.m_mean),3);
132  ss.m_kurtosis += pow((v-ss.m_mean),4);
133  }
134 
135  ss.m_variance /= ss.m_count;
136  ss.m_stdDeviation = pow(ss.m_variance, 0.5);
137  ss.m_skewness /= ss.m_count;
138  ss.m_skewness /= pow(ss.m_stdDeviation, 3);
139  ss.m_kurtosis /= ss.m_count;
140  ss.m_kurtosis /= pow(ss.m_stdDeviation, 4);
141 
142  ss.m_varCoeff = (100* ss.m_stdDeviation) / ss.m_mean;
143  ss.m_amplitude = ss.m_maxVal - ss.m_minVal;
144 
145  if((ss.m_count % 2) == 0)
146  ss.m_median = (values[(ss.m_count/2)] + values[(ss.m_count/2-1)])/2;
147  else
148  ss.m_median = values[(ss.m_count-1)/2];
149 
150  ss.m_mode = Mode(values);
151 }
152 
/*!
  \brief Returns the most frequent value(s) of a list of numbers.

  \param values The sampled values (any order).

  \return The values that occur most often, in ascending order. The result is empty when
          \a values is empty or when no value occurs more than once.

  \note NaN entries are skipped: they cannot be used as keys of an ordered map.
*/
std::vector<double> te::stat::Mode(const std::vector<double>& values)
{
  std::vector<double> mode;
  if (values.empty())
    return mode;

  // Number of occurrences of each distinct value.
  std::map<double, int> frequency;

  for (std::vector<double>::const_iterator it = values.begin(); it != values.end(); ++it)
  {
    const double value = *it;

    // Only NaN differs from itself.
    if (value != value)
      continue;

    ++frequency[value];
  }

  int repeat = 0;

  for (std::map<double, int>::const_iterator it = frequency.begin(); it != frequency.end(); ++it)
  {
    // A value seen only once is never reported as a mode.
    if (it->second <= 1)
      continue;

    if (it->second > repeat)
    {
      // New highest frequency: discard the candidates collected so far.
      repeat = it->second;
      mode.clear();
      mode.push_back(it->first);
    }
    else if (it->second == repeat)
    {
      // Tie with the current highest frequency.
      mode.push_back(it->first);
    }
  }

  return mode;
}
213 
/*!
  \brief Returns the most frequent string of a list.

  \param values The sampled values (any order).

  \return The string that occurs most often; on a tie, the smallest one according to
          std::string's operator<. An empty string is returned when \a values is empty.
*/
std::string te::stat::Mode(const std::vector<std::string>& values)
{
  if (values.empty())
    return "";

  // Number of occurrences of each distinct string.
  std::map<std::string, int> frequency;

  for (std::vector<std::string>::const_iterator it = values.begin(); it != values.end(); ++it)
    ++frequency[*it];

  std::string mode = "";
  int repeat = 0;

  // The map is visited in ascending key order and only a strictly higher
  // frequency replaces the candidate, so ties keep the smallest key.
  for (std::map<std::string, int>::const_iterator it = frequency.begin(); it != frequency.end(); ++it)
  {
    if (repeat < it->second)
    {
      repeat = it->second;
      mode = it->first;
    }
  }

  return mode;
}
266 
267 void te::stat::GetStringStatisticalSummaryQuery(const std::string& inDataset,
268  te::da::DataSource* inDatasource,
269  const std::string& propName,
271 {
272  assert(inDatasource);
273 
274  if (!inDatasource->dataSetExists(inDataset))
275  return;
276 
277  const te::da::DataSourceCapabilities dsCapabilities = inDatasource->getCapabilities();
278 
279  if(!dsCapabilities.supportsPreparedQueryAPI())
280  {
281  std::auto_ptr<te::da::DataSet> ds = inDatasource->getDataSet(inDataset);
282 
283  std::vector<std::string> stringVector = te::stat::GetStringData(ds.get(), propName);
284 
285  te::stat::GetStringStatisticalSummary(stringVector, ss);
286  }
287  else
288  {
289  te::da::Fields* fields = new te::da::Fields;
290 
291  te::da::PropertyName* p_name = new te::da::PropertyName(propName);
292 
293  te::da::Expression* e_min = new te::da::Min(p_name);
294  te::da::Field* f_min = new te::da::Field(*e_min, p_name->getName() + "_MIN_VALUE");
295 
296  te::da::Expression* e_max = new te::da::Max(p_name);
297  te::da::Field* f_max = new te::da::Field(*e_max, p_name->getName() + "_MAX_VALUE");
298 
299  te::da::Expression* e_count = new te::da::Count(p_name);
300  te::da::Field* f_count = new te::da::Field(*e_count, p_name->getName() + "_COUNT");
301 
302  te::da::Expression* e_validcount = new te::da::Count(p_name);
303  te::da::Field* f_validcount = new te::da::Field(*e_validcount, p_name->getName() + "_VALID_COUNT");
304 
305  fields->push_back(f_min);
306  fields->push_back(f_max);
307  fields->push_back(f_count);
308  fields->push_back(f_validcount);
309 
310  te::da::FromItem* fromItem = new te::da::DataSetName(inDataset);
311  te::da::From* from = new te::da::From;
312  from->push_back(fromItem);
313 
314  te::da::Select select(fields, from);
315 
316  std::auto_ptr<te::da::DataSet> dsQuery = inDatasource->query(select);
317 
318  if (!dsQuery.get())
319  return;
320 
321  dsQuery->moveFirst();
322 
323  ss.m_minVal = dsQuery->getAsString(1);
324  ss.m_maxVal = dsQuery->getAsString(2);
325  ss.m_count = dsQuery->getInt16(3);
326  ss.m_validCount = dsQuery->getInt16(4);
327 
328  // how to get the mode?
329  }
330 }
331 
332 void te::stat::GetNumericStatisticalSummaryQuery(const std::string& inDataset,
333  te::da::DataSource* inDatasource,
334  const std::string& propName,
336 {
337  assert(inDatasource);
338 
339  if (!inDatasource->dataSetExists(inDataset))
340  return;
341 
342  const te::da::DataSourceCapabilities dsCapabilities = inDatasource->getCapabilities();
343 
344  if(!dsCapabilities.supportsPreparedQueryAPI())
345  {
346  std::auto_ptr<te::da::DataSet> ds = inDatasource->getDataSet(inDataset);
347 
348  std::vector<double> numericVector = te::stat::GetNumericData(ds.get(), propName);
349 
350  te::stat::GetNumericStatisticalSummary(numericVector, ss);
351  }
352  else
353  {
354  te::da::PropertyName* p_name = new te::da::PropertyName(propName);
355 
356  te::da::Expression* e_min = new te::da::Min(p_name);
357  te::da::Field* f_min = new te::da::Field(*e_min, "MIN");
358 
359  te::da::Expression* e_max = new te::da::Max(p_name);
360  te::da::Field* f_max = new te::da::Field(*e_max, "MAX");
361 
362  te::da::Expression* e_count = new te::da::Count(p_name);
363  te::da::Field* f_count = new te::da::Field(*e_count, "COUNT");
364 
365  te::da::Expression* e_sum = new te::da::Sum(p_name);
366  te::da::Field* f_sum = new te::da::Field(*e_sum, "SUM");
367 
368  te::da::Expression* e_mean = new te::da::Avg(p_name);
369  te::da::Field* f_mean = new te::da::Field(*e_mean, "MEAN");
370 
371  te::da::Expression* e_stddev = new te::da::StdDev(p_name);
372  te::da::Field* f_stddev = new te::da::Field(*e_stddev, "STD_DEV");
373 
374  te::da::Expression* e_variance = new te::da::Variance(p_name);
375  te::da::Field* f_variance = new te::da::Field(*e_variance, "VARIANCE");
376 
377  te::da::Expression* e_amplitude = new te::da::Sub(*e_max, *e_min);
378  te::da::Field* f_amplitude = new te::da::Field(*e_amplitude, "AMPLITUDE");
379 
380  te::da::Fields* fields = new te::da::Fields;
381 
382  fields->push_back(f_min);
383  fields->push_back(f_max);
384  fields->push_back(f_count);
385  fields->push_back(f_sum);
386  fields->push_back(f_mean);
387  fields->push_back(f_stddev);
388  fields->push_back(f_variance);
389  fields->push_back(f_amplitude);
390 
391  te::da::FromItem* fromItem = new te::da::DataSetName(inDataset);
392  te::da::From* from = new te::da::From;
393  from->push_back(fromItem);
394 
395  te::da::Select select(fields, from);
396 
397  std::auto_ptr<te::da::DataSet> dsQuery = inDatasource->query(select);
398 
399  if (!dsQuery.get())
400  return;
401 
402  dsQuery->moveFirst();
403 
404  ss.m_minVal = boost::lexical_cast<double>(dsQuery->getAsString(0));
405  ss.m_maxVal = boost::lexical_cast<double>(dsQuery->getAsString(1));
406  ss.m_count = boost::lexical_cast<int>(dsQuery->getAsString(2));
407  ss.m_sum = boost::lexical_cast<double>(dsQuery->getAsString(3));
408  ss.m_mean = boost::lexical_cast<double>(dsQuery->getAsString(4));
409  ss.m_stdDeviation = boost::lexical_cast<double>(dsQuery->getAsString(5));
410  ss.m_variance = boost::lexical_cast<double>(dsQuery->getAsString(6));
411  ss.m_amplitude = boost::lexical_cast<double>(dsQuery->getAsString(7));
412  }
413 }
TESTATEXPORT void GetNumericStatisticalSummary(std::vector< double > &values, te::stat::NumericStatisticalSummary &ss, double nullVal)
const std::string & getName() const
It returns the property name.
Definition: PropertyName.h:80
TESTATEXPORT void GetNumericStatisticalSummaryQuery(const std::string &inDataset, te::da::DataSource *inDatasource, const std::string &propName, te::stat::NumericStatisticalSummary &ss)
A structure to hold the set of statistics from a set of numerical values.
Utility functions for the data access module.
An abstract class that models a source of data in a query.
Definition: FromItem.h:50
The Field class can be used to model an expression that takes part of the output items of a SELECT...
Definition: Field.h:50
A class that models the name of a dataset used in a From clause.
Definition: DataSetName.h:43
TESTATEXPORT void GetStringStatisticalSummaryQuery(const std::string &inDataset, te::da::DataSource *inDatasource, const std::string &propName, te::stat::StringStatisticalSummary &ss)
TESTATEXPORT std::vector< double > Mode(const std::vector< double > &values)
virtual std::auto_ptr< DataSet > query(const Select &q, te::common::TraverseType travType=te::common::FORWARDONLY, const te::common::AccessPolicy accessPolicy=te::common::RAccess)
It executes a query that may return some data using a generic query. This method always returns a dis...
Definition: DataSource.cpp:100
A class that models the name of any property of an object.
Definition: PropertyName.h:50
Count statistical function.
Definition: Count.h:46
A class that represents the known capabilities of a specific data source, i.e. this class informs all...
An abstract class for data providers like a DBMS, Web Services or a regular file. ...
Definition: DataSource.h:118
Avg statistical function.
Definition: Avg.h:46
Min statistical function.
Definition: Min.h:46
This is an abstract class that models a query expression.
Definition: Expression.h:47
StdDev statistical function.
Definition: StdDev.h:46
TESTATEXPORT std::vector< double > GetNumericData(te::da::DataSet *dataSet, const std::string propName)
Returns the values of a numeric type property in a vector of values.
Definition: Utils.cpp:158
A set of functions to calculate the statistic summary from a set of values.
TESAEXPORT double Sum(te::sa::GeneralizedProximityMatrix *gpm, int attrIdx)
Function used to calculate sum of a specific attribute from a gpm.
TESTATEXPORT std::vector< std::string > GetStringData(te::da::DataSet *dataSet, const std::string propName)
Returns the values of a string type property in a vector of values.
Definition: Utils.cpp:139
virtual std::auto_ptr< DataSet > getDataSet(const std::string &name, te::common::TraverseType travType=te::common::FORWARDONLY, const te::common::AccessPolicy accessPolicy=te::common::RAccess)
It gets the dataset identified by the given name. This method always returns a disconnected dataset...
Definition: DataSource.cpp:61
boost::ptr_vector< Field > Fields
Fields is just a boost::ptr_vector of Field pointers.
Definition: Fields.h:37
virtual const DataSourceCapabilities & getCapabilities() const =0
It returns the known capabilities of the data source.
TESTATEXPORT void GetStringStatisticalSummary(std::vector< std::string > &values, te::stat::StringStatisticalSummary &ss)
A Select models a query to be used when retrieving data from a DataSource.
Definition: Select.h:65
boost::ptr_vector< FromItem > From
It models the FROM clause for a query.
Definition: From.h:37
virtual bool dataSetExists(const std::string &name)
It checks if a dataset with the given name exists in the data source.
Definition: DataSource.cpp:439
The subtraction operator.
Definition: Sub.h:46
A structure to hold the set of statistics from a set of categorical (sample) values.
Variance statistical function.
Definition: Variance.h:46
An exception class for the statistical module.
Max statistical function.
Definition: Max.h:46
Configuration flags for the TerraLib Statistic module.