faif
RandomForest.hpp
1 // The Random Forest classifier
2 
3 #ifndef FAIF_RANDOM_FOREST_HPP
4 #define FAIF_RANDOM_FOREST_HPP
5 
6 #if defined(_MSC_VER) && (_MSC_VER >= 1400)
7 //msvc14.0 warnings for Boost.Serialization
8 #pragma warning(disable:4100)
9 #pragma warning(disable:4512)
10 #endif
11 
12 
13 #include "Classifier.hpp"
14 #include "DecisionTree.hpp"
15 #include "../utils/Random.hpp"
16 
17 #include <list>
18 #include <set>
19 #include <algorithm>
20 #include <iterator>
21 #include <limits>
22 #include <cassert>
23 #include <memory>
24 
25 #include <boost/serialization/list.hpp>
26 #include <boost/serialization/base_object.hpp>
27 #include <boost/serialization/nvp.hpp>
28 #include <boost/serialization/vector.hpp>
29 #include <boost/serialization/shared_ptr.hpp>
30 
31 namespace faif {
32  namespace ml {
33 
34  /** \brief random forest's parameters*/
36  RandomForestParams() : numFeaturesPerTree(1), numTrees(2),\
37  allowedNbrMiscEx(1), minInfGain(0.000001) {}
38  // number of features assigned per tree
39  int numFeaturesPerTree;
40  // number of trees in the forest
41  int numTrees;
42  //allowed number of badly classified examples for each category
43  int allowedNbrMiscEx;
44  // minimal information gain to accept test
45  double minInfGain;
46 
47  template<class Archive>
48  void serialize( Archive &ar, const unsigned int /* file_version*/ ){
49  ar & boost::serialization::make_nvp("numFeaturesPerTree", numFeaturesPerTree );
50  ar & boost::serialization::make_nvp("numTrees", numTrees );
51  ar & boost::serialization::make_nvp("allowedNbrMiscEx", allowedNbrMiscEx );
52  ar & boost::serialization::make_nvp("minInfGain", minInfGain );
53  }
54  };
55 
56  /** \brief Random Forest Classifier.
57 
58  Contains the attributes, attribute values and categories,
59  train examples, test examples and classifier methods.
60  */
61  template<typename Val>
62  class RandomForest : public Classifier<Val> {
63  public:
64  typedef typename Classifier<Val>::AttrValue AttrValue;
65  typedef typename Classifier<Val>::AttrDomain AttrDomain;
66  typedef typename Classifier<Val>::AttrIdd AttrIdd;
68  typedef typename Classifier<Val>::Domains Domains;
69  typedef typename Classifier<Val>::Beliefs Beliefs;
73  public:
74  RandomForest();
75  RandomForest(const Domains& attr_domains, const AttrDomain& category_domain);
76 
77  virtual ~RandomForest() { }
78 
79  /** \brief get number of trees as per Breiman's recommendation
80  * \param dataSize number of examples in the training dataset
81  * \param numFeatures number of features in the training dataset
82  */
83  static int getBreimanNumTrees(size_t dataSize, size_t numFeatures) {
84  return static_cast<int>(std::max(ceil(sqrt(2*dataSize)),\
85  ceil(2*numFeatures/ceil(sqrt(numFeatures)))));
86  }
87 
88  /** \brief get number of features per tree as recommended by Breiman
89  * \param numFeatures number of features in the training dataset
90  */
91  static int getBreimanNumFeatures(size_t numFeatures) { return ceil(sqrt(numFeatures)); }
92 
93  /** accessor - get training parameters */
94  const RandomForestParams& getTrainParams() const { return params_; }
95 
96  /** mutator - set training parameters */
97  void setTrainParams(const RandomForestParams& p) { params_ = p; }
98 
99  /** clear forest */
100  virtual void reset();
101 
102  /** \brief learn classifier (on the collection of training examples).
103  */
104  virtual void train(const ExamplesTrain& e);
105 
106  /** classify */
107  virtual AttrIdd getCategory(const ExampleTest&) const;
108 
109  /** \brief classify and return all classes with belief that the example is from given class
110  */
111  virtual Beliefs getCategories(const ExampleTest&) const;
112  private:
113  /** \brief create a set of trees using bootstrapping
114  */
115  void createForest();
116 
117  /**
118  \brief transform a collection of trees results into beliefs using the voting idea
119  \param list_ collection of the results of all classifiers
120  */
121  Beliefs prepareResults(const std::list<typename RandomForest<Val>::AttrIdd>&) const;
122 
123  /** \brief randomly select a subset of ExamplesTrain
124  size of subset = size of ExamplesTrain
125  */
126  ExamplesTrain exampleBootstrap(const ExamplesTrain&);
127 
128  /** \brief generate pseudorandom collection of numbers with uniform distribution
129  \param size size of returning collection (number of digits to generate)
130  \param range the upper bound of the sampling set (lower bound is 0)
131  \param gen random engine generator
132  \param isReplacementON if the sampling is to be with replacement or not
133  */
134  std::vector<int> uniformRandomGenerator(std::size_t size, std::size_t range, bool isReplacementON);
135 
136  /** copy c-tor not allowed */
137  RandomForest(const RandomForest&);
138 
139  /** assignment not allowed */
140  RandomForest& operator=(const RandomForest&);
141 
142  /**
143  internal class - implementation of a decision tree with attributes covering (based on the vector of allowed attributes numbers)
144  */
145  class RandomTree : public DecisionTree<Val>
146  {
147  public:
148  RandomTree(): DecisionTree<Val>(){}
149  RandomTree(const Domains& attr_domains, const AttrDomain& category_domain)
150  : DecisionTree<Val>(attr_domains, category_domain){
151  }
152 
153  /** clear forest */
154  void reset()
155  {
157  }
158 
159  /** \brief learn classifier (on the collection of training examples).
160  */
161  void train(const ExamplesTrain& e)
162  {
164  }
165 
166  /** classify */
167  AttrIdd getCategory(const ExampleTest& e) const
168  {
170  }
171 
172  /** \brief classify and return all classes with belief that the example is from given class
173  */
174  typename RandomForest<Val>::Beliefs getCategories(const ExampleTest& e) const
175  {
177  }
178 
179  template < class Tcontainer >
180  static Tcontainer CoverDomains(Tcontainer domains_, std::vector<int> attribs_allowed)
181  {
182  Tcontainer newDomains_;
183  for (std::vector<int>::const_iterator it=attribs_allowed.begin(); it != attribs_allowed.end(); ++it)
184  {
185  typename Tcontainer::iterator it_d=domains_.begin();
186  std::advance(it_d,*it);
187  newDomains_.push_back(*it_d);
188  //std::cout << *it_d << std::endl;
189  }
190  return newDomains_;
191  }
192  private:
193  /** \brief serialization using boost::serialization */
194  friend class boost::serialization::access;
195 
196  template<class Archive>
197  void serialize( Archive &ar, const unsigned int /* file_version*/ ){
198  ar & boost::serialization::make_nvp("BaseTree",boost::serialization::base_object<DecisionTree<Val> >(*this));
199 
200  }
201 
202  };
203 
204  private:
205  /** \brief serialization using boost::serialization */
206  friend class boost::serialization::access;
207 
208  template<class Archive>
209  void serialize( Archive &ar, const unsigned int /* file_version*/ ){
210  ar.template register_type<RandomTree>();
211  ar.template register_type<RandomForestParams>();
212 
213  ar & boost::serialization::make_nvp("RFCBase", boost::serialization::base_object<Classifier<Val> >(*this) );
214  ar & boost::serialization::make_nvp("RTrees", trees_ );
215  ar & boost::serialization::make_nvp("params", params_);
216  }
217 
218  private:
219  /** collection of trees */
220  typedef std::list<boost::shared_ptr<RandomTree>> RTrees;
221  RTrees trees_;
222  // parameters of the random forest
223  RandomForestParams params_;
224  }; //class RandomForest
225 
226  //////////////////////////////////////////////////////////////////////////////////////////////////
227  // class RandomForest implementation
228  //////////////////////////////////////////////////////////////////////////////////////////////////
229 
230  template<typename Val>
232 
233  template<typename Val>
234  RandomForest<Val>::RandomForest(const Domains& attr_domains, const AttrDomain& category_domain)
235  : Classifier<Val>(attr_domains, category_domain){}
236 
237  /** clear the forest */
238  template<typename Val>
240  {
241  trees_.clear();
242  }
243 
244  template<typename Val>
245  void RandomForest<Val>::train(const ExamplesTrain& e)
246  {
247  createForest();
248  for( typename RTrees::iterator it=trees_.begin(); it != trees_.end(); it++ )
249  {
250  RandomTree & obj = *(*it); // dereference iterator, dereference pointer
251  obj.train(exampleBootstrap(e));
252  //std::cout << obj << std::endl << std::endl;
253  }
254  }
255 
256  template<typename Val>
257  typename RandomForest<Val>::AttrIdd RandomForest<Val>::getCategory(const ExampleTest& e) const
258  {
259  //std::list< typename Val::Value> results_;
260  std::list< RandomForest<Val>::AttrIdd> results_;
261  for( typename RTrees::const_iterator it=trees_.begin(); it != trees_.end(); it++ )
262  {
263  RandomTree & obj = *(*it); // dereference iterator, dereference pointer
264  results_.push_back(obj.getCategory(e));
265  }
266  Beliefs bel_ = prepareResults(results_);
267  if(bel_.empty() )
268  return AttrDomain::getUnknownId();
269  else
270  return bel_.at(0).getValue();
271  }
272 
273  template<typename Val>
274  typename RandomForest<Val>::Beliefs RandomForest<Val>::getCategories(const ExampleTest& e) const
275  {
276  std::list< RandomForest<Val>::AttrIdd> results_;
277  for( typename RTrees::const_iterator it=trees_.begin(); it != trees_.end(); it++ )
278  {
279  RandomTree & obj = *(*it); // dereference iterator, dereference pointer
280  results_.push_back(obj.getCategory(e));
281  //std::cout << obj.getCategory(e)->get() << std::endl;
282  }
283  Beliefs bel_ = prepareResults(results_);
284  if(bel_.empty() )
285  return Beliefs();
286  else
287  return bel_;
288  }
289 
290  template<typename Val>
292  {
293  size_t size_ = Classifier<Val>::getAttrDomains().size();
294  // create collection of trees with random attributes
296  p.allowedNbrMiscEx = params_.allowedNbrMiscEx;
297  p.minInfGain = params_.minInfGain;
298  for(int i=0; i < params_.numTrees; ++i) {
299  auto tree = boost::shared_ptr<RandomTree>(new RandomTree(
300  RandomTree::CoverDomains(
302  uniformRandomGenerator(params_.numFeaturesPerTree, size_-1, false)),
304  tree->setTrainParams(p);
305  trees_.push_back(tree);
306  }
307  }
308 
309  template<typename Val>
310  typename RandomForest<Val>::Beliefs RandomForest<Val>::prepareResults(const std::list<typename RandomForest<Val>::AttrIdd>& list_) const
311  {
312  std::vector<std::pair<typename RandomForest<Val>::AttrIdd,double>> resultsList_;
313  std::map<typename RandomForest<Val>::AttrIdd,double> count_;
314  Beliefs toRet_;
315  // Create map of occurrence
316  for( typename std::list<typename RandomForest<Val>::AttrIdd>::const_iterator it=list_.begin(); it != list_.end(); it++ )
317  {
318  // Get the feature name
319  typename Val::Value class_ = ((typename RandomForest<Val>::AttrIdd) *it)->get();
320  // And check whether this feature already exists in map
321  auto itClass_ = find_if(count_.begin(), count_.end(), [&class_](const std::pair<typename RandomForest<Val>::AttrIdd,double>& obj) {return obj.first->get()==class_;});
322  // Create new element if not or increase the counter if so
323  if (itClass_ != std::end(count_))
324  ++itClass_->second;
325  else
326  ++count_[*it];
327  }
328  // Normalize map values by list_ size - switch replicates into probability
329  // Convert std::map into std::vector of pairs
330  for( typename std::map<typename RandomForest<Val>::AttrIdd,double>::const_iterator it = count_.begin(); it != count_.end(); ++it )
331  resultsList_.push_back(std::pair<typename RandomForest<Val>::AttrIdd,double>(it->first,it->second/list_.size()));
332  // Sort a vector of pairs based on the probability
333  std::sort(resultsList_.begin(), resultsList_.end(),
334  boost::bind(&std::pair<typename RandomForest<Val>::AttrIdd,double>::second, _1) >
335  boost::bind(&std::pair<typename RandomForest<Val>::AttrIdd,double>::second, _2));
336  // Create Beliefs
337  for( typename std::vector<std::pair<typename RandomForest<Val>::AttrIdd,double>>::const_iterator it = resultsList_.begin(); it != resultsList_.end(); ++it )
338  toRet_.push_back(typename Beliefs::value_type(it->first,it->second));
339  return toRet_;
340  }
341 
342  template<typename Val>
343  typename RandomForest<Val>::ExamplesTrain RandomForest<Val>::exampleBootstrap(const ExamplesTrain& example_)
344  {
345  size_t S_ = example_.size();
346  ExamplesTrain subset_;
347  std::vector<int> sample_ = uniformRandomGenerator(S_, S_-1, true);
348  for( std::vector<int>::iterator it = sample_.begin(); it != sample_.end(); ++it )
349  subset_.push_back(*std::next(example_.begin(),*it));
350 
351  return subset_;
352  }
353 
354  template<typename Val>
355  std::vector<int> RandomForest<Val>::uniformRandomGenerator(std::size_t size, std::size_t range, bool isReplacementON)
356  {
357  if (size > range + 1 && !isReplacementON) {
358  throw std::invalid_argument("Size can not be bigger than range if replacement isn't on!");
359  }
360  std::vector<int> toRet_(size, -1);
361  // \-> by default whole vector is initiate with zeros
362  // and it would be dangerous if we generate numbers from zero
363  RandomInt uniform_generator(0, static_cast<int>(range) );
364  for( std::vector<int>::iterator it = toRet_.begin(); it != toRet_.end(); ++it )
365  {
366  int sample_ = uniform_generator();
367  while( !isReplacementON && std::find(toRet_.begin(), toRet_.end(), sample_) != toRet_.end() )
368  {
369  sample_ = uniform_generator();
370  }
371  *it = sample_;
372  }
373 
374  return toRet_;
375  }
376 
377 
378  }//namespace ml
379 }//namespace faif
380 
381 #endif //FAIF_RANDOM_FOREST_HPP
382 
Val::DomainType::ValueId AttrIdd
attribute id representation in learning
Definition: Classifier.hpp:55
const Domains & getAttrDomains() const
accessor
Definition: Classifier.hpp:109
Definition: Chain.h:17
virtual void train(const ExamplesTrain &e)
learn classifier (on the collection of training examples).
Definition: RandomForest.hpp:245
virtual AttrIdd getCategory(const ExampleTest &) const
Definition: RandomForest.hpp:257
Val::Value AttrValue
attribute value representation in learning
Definition: Classifier.hpp:49
Decision Tree Classifier.
Definition: DecisionTree.hpp:63
The Decision Tree Classifier, inspired ID3 algorithm (Iterate Dichotomizer)
point and some feature
Definition: Point.hpp:58
static int getBreimanNumTrees(size_t dataSize, size_t numFeatures)
get number of trees as per Breiman&#39;s recommendation
Definition: RandomForest.hpp:83
Belief< Val >::Beliefs Beliefs
collection of pair (AttrIdd, Probability)
Definition: Classifier.hpp:64
inner class - examples train collection
Definition: Classifier.hpp:82
virtual AttrIdd getCategory(const ExampleTest &) const
Definition: DecisionTree.hpp:401
const RandomForestParams & getTrainParams() const
Definition: RandomForest.hpp:94
virtual void train(const ExamplesTrain &e)
learn classifier (on the collection of training examples).
Definition: DecisionTree.hpp:369
Point in n-space, each component of the same type.
Definition: Point.hpp:22
virtual void reset()
Definition: RandomForest.hpp:239
virtual Beliefs getCategories(const ExampleTest &) const
classify and return all classes with belief that the example is from given class
Definition: DecisionTree.hpp:413
Val::DomainType::ValueIdSerialize AttrIddSerialize
for serialization the const interferes
Definition: Classifier.hpp:58
the uniform distribution for int, in range <min,max>, uses RandomSingleton
Definition: Random.hpp:107
virtual Beliefs getCategories(const ExampleTest &) const
classify and return all classes with belief that the example is from given class
Definition: RandomForest.hpp:274
param for training decision tree
Definition: DecisionTree.hpp:45
random forest&#39;s parameters
Definition: RandomForest.hpp:35
Val::DomainType AttrDomain
the attribute domain for learning
Definition: Classifier.hpp:52
virtual void reset()
Definition: DecisionTree.hpp:359
void setTrainParams(const RandomForestParams &p)
Definition: RandomForest.hpp:97
static int getBreimanNumFeatures(size_t numFeatures)
get number of features per tree as recommended by Breiman
Definition: RandomForest.hpp:91
the clasiffier interface
Definition: Classifier.hpp:43
Random Forest Classifier.
Definition: RandomForest.hpp:62