faif
Classifier.hpp
1 #ifndef FAIF_CLASIFIER_HPP_
2 #define FAIF_CLASIFIER_HPP_
3 
4 
5 #include <memory>
6 #include <map>
7 #include <algorithm>
8 #include <cmath>
9 
10 #include <boost/bind.hpp>
11 
12 #include <boost/serialization/serialization.hpp>
13 #include <boost/serialization/list.hpp>
14 #include <boost/serialization/nvp.hpp>
15 #include <boost/serialization/vector.hpp>
16 
17 #include "../Value.hpp"
18 #include "../Point.hpp"
19 #include "Belief.hpp"
20 
21 namespace faif {
22 
23  /** \brief machine learning namespace (mainly classifier algorithms)
24  */
25  namespace ml {
26 
/** \brief single entropy term for a relative frequency.

    Returns -(freq * log(freq)) using the natural logarithm when freq is
    strictly positive, and 0 for every other input (zero, negative, NaN).
*/
inline double calcEntropy(double freq) {
    return (freq > 0.0) ? -(freq * std::log(freq)) : 0.0;
}
34 
35  /** \brief the clasiffier interface
36 
37  type definitions for AttrValue, AttrDomain, AttrIdd and others
38 
39  Store attribute domains and category domain, methods to create training/testing example,
40  load/save (boost::serialization), pure virtual for training and classifing.
41  */
42  template<typename Val>
43  class Classifier {
44  BOOST_CONCEPT_ASSERT((ValueConcept<Val>));
45  public:
46  typedef Val Value;
47 
48  /** \brief attribute value representation in learning */
49  typedef typename Val::Value AttrValue;
50 
51  /** \brief the attribute domain for learning */
52  typedef typename Val::DomainType AttrDomain;
53 
54  /** \brief attribute id representation in learning */
55  typedef typename Val::DomainType::ValueId AttrIdd;
56 
57  /** \brief for serialization the const interferes */
58  typedef typename Val::DomainType::ValueIdSerialize AttrIddSerialize;
59 
60  /** \breif the domain collection; the pointers should be valid */
62 
63  /** \brief collection of pair (AttrIdd, Probability) */
64  typedef typename Belief<Val>::Beliefs Beliefs;
65 
66  /** \brief the test example (collection of AttrIdd) */
68 
69  /** \brief the helping inner class to init PointAndFeature structure using getUnknownId method
70  */
71  template<typename Feature> struct InitValueId {
72  static Feature init() {
73  return Val::DomainType::getUnknownId();
74  }
75  };
76 
77  /** \brief the train example (test example and the category) */
79 
80 
81  /** \brief inner class - examples train collection */
82  class ExamplesTrain : public std::vector<ExampleTrain> {
83  public:
84  /** \brief the most common category in the training example container */
85  AttrIdd getMajorCategory() const;
86 
87  /** \brief entropy of set of examples */
88  double entropy() const;
89  private:
90  /** \brief serialization using boost::serialization */
91  friend class boost::serialization::access;
92 
93  template<class Archive>
94  void serialize( Archive &ar, const unsigned int file_version ){
95  ar & boost::serialization::make_nvp("Examples", boost::serialization::base_object< std::vector<ExampleTrain> >(*this) );
96  }
97 
98  };
99 
100  public:
101  Classifier() {}
102 
103  Classifier(const Domains& attr_domains, const AttrDomain& category_domain)
104  : domains_(attr_domains), category_(category_domain) {}
105 
106  virtual ~Classifier(){}
107 
108  /** \brief accessor */
109  const Domains& getAttrDomains() const { return domains_; }
110 
111  /** \brief accessor */
112  const AttrDomain& getCategoryDomain() const { return category_; }
113 
114  /** \brief accessor (helper) */
115  AttrIdd getCategoryIdd(const AttrValue& val) const { return category_.find(val); }
116 
117  /** \brief the clasiffier will have no knowledge */
118  virtual void reset() = 0;
119 
120  /** \brief learn classifier (on the collection of training examples) */
121  virtual void train(const ExamplesTrain&) = 0;
122 
123  /** \brief classify */
124  virtual AttrIdd getCategory(const ExampleTest& example) const = 0;
125 
126  /** \brief classify and return all classes with belief that the example is from each class
127  * classes are sorted from the best (index 0) to the worst */
128  virtual Beliefs getCategories(const ExampleTest& example) const = 0;
129 
130  /** the ostream method */
131  virtual void write(std::ostream& os) const;
132  private:
133  /** \brief serialization using boost::serialization */
134  friend class boost::serialization::access;
135 
136  template<class Archive>
137  void save(Archive & ar, const unsigned int /* file_version */) const {
138  //not used boost::serialization::list (for list of lists) because load require
139  //instantiate containter item and load to them
140  //because address_restarting works only once (I'm not sure, but this is my experience)
141  unsigned int size = static_cast<unsigned int>(domains_.size());
142  ar & boost::serialization::make_nvp("ClassifierDomainsCount", size );
143  for(typename Domains::const_iterator i = domains_.begin(); i != domains_.end(); ++i) {
144  ar & boost::serialization::make_nvp("ClassifierDomain", *i);
145  }
146  ar & boost::serialization::make_nvp("ClassifierCategory", category_);
147  }
148 
149  template<class Archive>
150  void load(Archive & ar, const unsigned int /* file_version */) {
151  unsigned int size;
152  ar & boost::serialization::make_nvp("ClassifierDomainsCount", size );
153  domains_.clear();
154  for(unsigned int i = 0; i < size; ++i) {
155  domains_.push_back(AttrDomain()); //add empty item to containter
156  AttrDomain& d = domains_.back(); //upload to this item
157  // so there is no need to additional address_restarting for boost::serialization
158  ar >> boost::serialization::make_nvp("ClassifierDomain", d);
159  }
160  ar & boost::serialization::make_nvp("ClassifierCategory", category_);
161  }
162 
163  template<class Archive>
164  void serialize( Archive &ar, const unsigned int file_version ){
165  boost::serialization::split_member(ar, *this, file_version);
166  }
167 
168  private:
169  Domains domains_;
170  AttrDomain category_;
171  };
172 
173  /** the ostream method */
174  template<typename Val>
175  void Classifier<Val>::write(std::ostream& os) const {
176  os << "Categories(" << category_.getSize() << ")" << category_ << std::endl;
177  os << "Attributes(" << static_cast<int>(domains_.size()) << "):";
178  std::copy(domains_.begin(), domains_.end(), std::ostream_iterator<AttrDomain>(os,"") );
179  }
180 
181  /**
182  ostream operator
183  */
184  template<typename Val>
185  std::ostream& operator<<(std::ostream& os, const Classifier<Val>& c) {
186  c.write(os);
187  return os;
188  }
189 
190  /** \brief create the test example from iterator range or C-like table of values */
191  template<typename It, typename Val>
193  createExample(It begin, It end, const Classifier<Val>& classifier) {
194  return classifier.getAttrDomains().createPoint(begin, end );
195  }
196 
197  /** \brief create the train example from range or C-like table of values */
198  template<typename It, typename Val>
200  createExample(It begin, It end, const typename Classifier<Val>::AttrValue& cat, const Classifier<Val>& classifier) {
202  return ExampleTrain( classifier.getAttrDomains().createPoint(begin, end ), classifier.getCategoryDomain().find(cat) );
203  }
204 
205  /** \brief create the test example from collection of pairs: attribute(domain) identifier and attribute value */
206  template<typename Val>
208  createExample(const std::vector<std::pair<std::string, typename Classifier<Val>::AttrValue> >& collection, const Classifier<Val>& classifier) {
209  return classifier.getAttrDomains().createPoint(collection);
210  }
211 
212  /** \brief create the test example from collection of pairs: attribute(domain) identifier and attribute value.
213  Throws exception if the string identifiers not match the required domains identifiers */
214  template<typename Val>
216  createExampleStrict(const std::vector<std::pair<std::string, typename Classifier<Val>::AttrValue> >& collection, const Classifier<Val>& classifier) {
217  return classifier.getAttrDomains().createPointStrict(collection);
218  }
219 
220  /** \brief create the train example from collection of pairs: attribute(domain) identifier and attribute value */
221  template<typename Val>
223  createExample(const std::vector<std::pair<std::string, typename Classifier<Val>::AttrValue> >& collection, const typename Classifier<Val>::AttrValue& cat,
224  const Classifier<Val>& classifier) {
226  return ExampleTrain( classifier.getAttrDomains().createPoint(collection), classifier.getCategoryDomain().find(cat) );
227  }
228 
229 
230  /** helping functor for calculate histogram based on categories for collection of train examples.
231  It could be used to find major category. */
232  template<typename Val>
234  public:
235  typedef typename Classifier<Val>::AttrDomain AttrDomain;
236  typedef typename Classifier<Val>::Domains Domains;
237  typedef typename Classifier<Val>::AttrIdd AttrIdd;
238  typedef typename Classifier<Val>::Beliefs Beliefs;
241 
242  typedef std::map<AttrIdd,int> Counters;
243 
244  /** \brief c-tor, empty counters */
246 
247  /** \brief c-tor, counters initialized by train examples collection */
248  TrainExampleCategoryCounters(typename ExamplesTrain::const_iterator beg,
249  typename ExamplesTrain::const_iterator end) : sum_(0) {
250  std::for_each( beg, end, boost::bind(&TrainExampleCategoryCounters::inc, this, _1 ) );
251  }
252 
253  //increment counters
254  void inc(const ExampleTrain& e) {
255 
256  ++sum_;
257  typename Counters::iterator it = counters_.find(e.getFeature() );
258  if(it == counters_.end() ) {
259  counters_.insert( std::make_pair(e.getFeature(), 1 ) );
260  }
261  else {
262  ++it->second;
263  }
264  }
265  //key for maximum value
266  AttrIdd maxCount() const {
267  typename Counters::const_iterator it =
268  std::max_element( counters_.begin(), counters_.end(),
269  boost::bind(&Counters::value_type::second, _1) < boost::bind(&Counters::value_type::second, _2) );
270  if(it != counters_.end() )
271  return it->first;
272  else
273  return Val::DomainType::getUnknownId(); //not found max i.e. empty container, return the unknown id
274  }
275 
276  /** \brief access to counters */
277  const Counters& get() const { return counters_; }
278 
279  /** \brief optimization: instead of accumulate all values from counters container keep the integer member */
280  int getSum() const { return sum_; }
281 
282  /** \brief entropy of counters */
283  double entropy() const {
284  double entr = 0.0;
285  if( sum_ > 0 ) {
286  double sum = static_cast<double>( sum_ );
287  for(typename Counters::const_iterator i = counters_.begin(); i != counters_.end(); ++i) {
288  entr += calcEntropy( static_cast<double>(i->second) / sum );
289  }
290  }
291  return entr;
292  }
293  /** \brief histogram from counters - Beliefs class where each position is counter divided by counters sum.
294 
295  Histogram is sorted from biggest to smallest probability
296  */
297  Beliefs getHistogram() const {
298  Beliefs histogram;
299  for(typename Counters::const_iterator i = counters_.begin(); i != counters_.end(); ++i) {
300  histogram.push_back( typename Beliefs::value_type(i->first,
301  static_cast<Probability>(i->second) / static_cast<Probability>(sum_) ) );
302  }
303  std::sort(histogram.begin(), histogram.end());
304  return histogram;
305  }
306  private:
307  Counters counters_;
308  int sum_;
309  };
310 
311  /** method implementation for Classifier<Val>::ExamplesTrain */
312  template<typename Val>
313  typename Classifier<Val>::AttrIdd
315  TrainExampleCategoryCounters<Val> counters(this->begin(), this->end());
316  return counters.maxCount();
317  }
318 
319  /** method implementation for Classifier<Val>::ExamplesTrain */
320  template<typename Val>
322  TrainExampleCategoryCounters<Val> counters(this->begin(), this->end());
323  return counters.entropy();
324  }
325 
326  } //namespace ml
327 } //namespace faif
328 
329 #endif //FAIF_CLASIFIER_HPP_
Val::DomainType::ValueId AttrIdd
attribute id representation in learning
Definition: Classifier.hpp:55
PointAndFeature< Val, AttrIdd, InitValueId > ExampleTrain
the train example (test example and the category)
Definition: Classifier.hpp:78
const Domains & getAttrDomains() const
accessor
Definition: Classifier.hpp:109
Definition: Chain.h:17
virtual AttrIdd getCategory(const ExampleTest &example) const =0
classify
Val::Value AttrValue
attribute value representation in learning
Definition: Classifier.hpp:49
virtual void train(const ExamplesTrain &)=0
learn classifier (on the collection of training examples)
Definition: Classifier.hpp:233
virtual Beliefs getCategories(const ExampleTest &example) const =0
classify and return all classes with belief that the example is from each class classes are sorted fr...
virtual void write(std::ostream &os) const
Definition: Classifier.hpp:175
point and some feature
Definition: Point.hpp:58
const AttrDomain & getCategoryDomain() const
accessor
Definition: Classifier.hpp:112
Belief< Val >::Beliefs Beliefs
collection of pair (AttrIdd, Probability)
Definition: Classifier.hpp:64
inner class - examples train collection
Definition: Classifier.hpp:82
double entropy() const
entropy of counters
Definition: Classifier.hpp:283
Point in n-space, each component of the same type.
Definition: Point.hpp:22
Beliefs getHistogram() const
histogram from counters - Beliefs class where each position is counter divided by counters sum...
Definition: Classifier.hpp:297
the value concept
Definition: Value.hpp:41
Val::DomainType::ValueIdSerialize AttrIddSerialize
for serialization the const interferes
Definition: Classifier.hpp:58
double calcEntropy(double freq)
calculate x * log(x) value. If x == 0 return 0.
Definition: Classifier.hpp:28
Classifier< Val >::ExampleTest createExampleStrict(const std::vector< std::pair< std::string, typename Classifier< Val >::AttrValue > > &collection, const Classifier< Val > &classifier)
create the test example from collection of pairs: attribute(domain) identifier and attribute value...
Definition: Classifier.hpp:216
TrainExampleCategoryCounters(typename ExamplesTrain::const_iterator beg, typename ExamplesTrain::const_iterator end)
c-tor, counters initialized by train examples collection
Definition: Classifier.hpp:248
virtual void reset()=0
the classifier will have no knowledge
int getSum() const
optimization: instead of accumulate all values from counters container keep the integer member ...
Definition: Classifier.hpp:280
Val::DomainType AttrDomain
the attribute domain for learning
Definition: Classifier.hpp:52
TrainExampleCategoryCounters()
c-tor, empty counters
Definition: Classifier.hpp:245
Classifier< Val >::ExampleTest createExample(It begin, It end, const Classifier< Val > &classifier)
create the test example from iterator range or C-like table of values
Definition: Classifier.hpp:193
Point< Val > ExampleTest
the test example (collection of AttrIdd)
Definition: Classifier.hpp:67
AttrIdd getCategoryIdd(const AttrValue &val) const
accessor (helper)
Definition: Classifier.hpp:115
Space< AttrDomain > Domains
Definition: Classifier.hpp:61
the helping inner class to init PointAndFeature structure using getUnknownId method ...
Definition: Classifier.hpp:71
the classifier interface
Definition: Classifier.hpp:43
AttrIdd getMajorCategory() const
the most common category in the training example container
Definition: Classifier.hpp:314
double entropy() const
entropy of set of examples
Definition: Classifier.hpp:321