3 #ifndef FAIF_NAIVE_BAYESIAN_HPP_ 4 #define FAIF_NAIVE_BAYESIAN_HPP_ 6 #if defined(_MSC_VER) && (_MSC_VER >= 1400) 8 #pragma warning(disable:4100) 9 #pragma warning(disable:4512) 16 #include <boost/bind.hpp> 17 #include <boost/serialization/split_member.hpp> 18 #include <boost/serialization/base_object.hpp> 19 #include <boost/serialization/nvp.hpp> 20 #include <boost/serialization/map.hpp> 22 #include "Classifier.hpp" 32 template<
typename Val>
46 NaiveBayesian(
const Domains& attr_domains,
const AttrDomain& category_domain);
53 virtual void train(
const ExamplesTrain& e) {
58 virtual AttrIdd
getCategory(
const ExampleTest&)
const;
67 virtual void write(std::ostream& os)
const;
79 void loadSaveState()
const;
84 template<
class Archive>
85 void save(Archive & ar,
const unsigned int )
const;
87 template<
class Archive>
88 void load(Archive & ar,
const unsigned int );
/**
 * Serialization entry point.
 *
 * Hands the archive, this object and the file version to
 * boost::serialization::split_member, which dispatches to the save() or
 * load() member template declared above depending on the archive direction.
 *
 * \param ar           archive being written to or read from
 * \param file_version class version passed through unchanged to split_member
 */
90 template<
class Archive>
91 void serialize( Archive &ar,
const unsigned int file_version ){
92 boost::serialization::split_member(ar, *
this, file_version);
101 class NaiveBayesianTraining;
103 std::unique_ptr<NaiveBayesianTraining> impl_;
111 template<
class T>
class CategoryData {
113 typedef std::map<AttrIdd,T> AttrData;
115 CategoryData() : data_(0), attrData_() { }
116 CategoryData(
const T& d) : data_(d), attrData_() { }
117 CategoryData(
const T& d,
const AttrData& ad) : data_(d), attrData_(ad) { }
118 CategoryData(
const CategoryData& cd) : data_(cd.data_), attrData_(cd.attrData_) { }
119 CategoryData& operator=(
const CategoryData& cd) {
121 attrData_ = cd.attrData_;
130 friend class boost::serialization::access;
/**
 * Writes this object to an output archive.
 *
 * Two name-value pairs are streamed, in this order: data_ under the name
 * "Category" and attrData_ under the name "Data". The version argument is
 * unnamed and unused.
 *
 * \param ar archive to write to
 */
132 template<
class Archive>
133 void save(Archive & ar,
const unsigned int )
const {
134 ar << boost::serialization::make_nvp(
"Category", data_ );
135 ar << boost::serialization::make_nvp(
"Data", attrData_ );
138 template<
class Archive>
139 void load(Archive & ar,
const unsigned int ) {
140 ar >> boost::serialization::make_nvp(
"Category", data_ );
141 typedef std::map<AttrIddSerialize,T> Map;
143 ar >> boost::serialization::make_nvp(
"Data", m );
145 for(
typename Map::const_iterator ii = m.begin(); ii != m.end(); ++ii) {
147 attrData_.insert(
typename AttrData::value_type(ii->first, ii->second) );
/**
 * Serialization entry point.
 *
 * Forwards to boost::serialization::split_member so that the separate
 * save() and load() member templates of this class are used for writing
 * and reading respectively.
 *
 * \param ar           archive being written to or read from
 * \param file_version class version passed through unchanged to split_member
 */
151 template<
class Archive>
152 void serialize( Archive &ar,
const unsigned int file_version ){
153 boost::serialization::split_member(ar, *
this, file_version);
161 class NaiveBayesianTraining {
165 typedef std::map<AttrIdd, CategoryData<int> > CategoryCounters;
167 typedef typename CategoryData<int>::AttrData SimpleCounters;
170 NaiveBayesianTraining() : parent_(0L) {}
172 NaiveBayesianTraining(
NaiveBayesian& parent) : parent_(&parent) {}
174 virtual ~NaiveBayesianTraining() {}
177 virtual void addTraining(
const ExampleTrain& example);
/**
 * Classification request received by this state object.
 *
 * Nothing is computed here: the call is forwarded to
 * parent_->switchGetCategory(example) and that result is returned.
 *
 * NOTE(review): parent_ is dereferenced unconditionally, while the default
 * constructor leaves it null - confirm that path cannot reach this call.
 *
 * \param example the example to classify
 * \return whatever parent_->switchGetCategory(example) returns
 */
180 virtual AttrIdd
getCategory(
const ExampleTest& example) {
181 return parent_->switchGetCategory(example);
186 return parent_->switchGetCategories(example);
/**
 * Load/save notification received by this state object.
 *
 * Forwards to parent_->switchLoadSaveState(); no other work is done here.
 */
190 virtual void loadSaveState() {
191 parent_->switchLoadSaveState();
195 virtual void write(std::ostream& os)
const;
198 int getCategoryCounter(AttrIdd cat_val)
const;
201 int getCategoryValCounter(AttrIdd cat_val, AttrIdd value)
const;
206 friend class boost::serialization::access;
207 template<
class Archive>
208 void serialize(Archive & ar,
const unsigned int ){
210 ar & boost::serialization::make_nvp(
"Parent", parent_ );
215 CategoryCounters counters_;
217 NaiveBayesianTraining(
const NaiveBayesianTraining&);
219 NaiveBayesianTraining& operator=(
const NaiveBayesianTraining&);
224 class NaiveBayesianClasify :
public NaiveBayesianTraining {
226 typedef typename CategoryData<Probability>::AttrData Counters;
228 typedef std::map<AttrIdd, CategoryData<Probability> > InternalProbabilities;
231 NaiveBayesianClasify() {}
233 NaiveBayesianClasify(
NaiveBayesian& parent,
const NaiveBayesianTraining& nb_train) : NaiveBayesianTraining(parent) {
238 virtual ~NaiveBayesianClasify() {}
/**
 * Overrides NaiveBayesianTraining::addTraining.
 *
 * The example is not processed by this object; it is handed to
 * parent_->switchAddTraining(example).
 *
 * \param example training example to pass on to the parent
 */
241 virtual void addTraining(
const ExampleTrain& example) {
242 this->parent_->switchAddTraining(example);
246 virtual AttrIdd
getCategory(
const ExampleTest& example);
252 virtual void loadSaveState() {
257 virtual void write(std::ostream& os)
const;
260 Probability getCategoryCounter(AttrIdd cat_val)
const;
262 Probability getCategoryCounterLog(AttrIdd cat_val)
const;
264 Probability getCategoryValCounter(AttrIdd cat_val, AttrIdd value)
const;
266 Probability getCategoryValCounterLog(AttrIdd cat_val, AttrIdd value)
const;
269 friend class boost::serialization::access;
271 template<
class Archive>
272 void save(Archive & ar,
const unsigned int )
const;
274 template<
class Archive>
275 void load(Archive & ar,
const unsigned int );
/**
 * Serialization entry point.
 *
 * Forwards to boost::serialization::split_member, which selects the save()
 * or load() member template declared above according to the archive
 * direction.
 *
 * \param ar           archive being written to or read from
 * \param file_version class version passed through unchanged to split_member
 */
277 template<
class Archive>
278 void serialize( Archive &ar,
const unsigned int file_version ){
279 boost::serialization::split_member(ar, *
this, file_version);
283 InternalProbabilities probabl_;
285 void calculate(
const NaiveBayesianTraining& nb_train);
287 Probability calcProbabilityForExample(
const ExampleTest& example, AttrIdd cat_val)
const;
290 NaiveBayesianClasify(
const NaiveBayesianClasify&);
292 NaiveBayesianClasify& operator=(
const NaiveBayesianClasify&);
302 template<
typename Val>
305 impl_.reset(
new NaiveBayesianTraining(*
this) );
308 template<
typename Val>
310 : Classifier<Val>(attr_domains, category_domain)
312 impl_.reset(
new NaiveBayesianTraining(*
this) );
316 template<
typename Val>
318 impl_.reset(
new NaiveBayesianTraining(*
this) );
322 template<
typename Val>
323 typename NaiveBayesian<Val>::AttrIdd
325 return impl_->getCategory(example);
329 template<
typename Val>
330 typename NaiveBayesian<Val>::Beliefs
332 return impl_->getCategories(example);
336 template<
typename Val>
338 impl_->addTraining(example);
342 template<
typename Val>
345 os << std::endl <<
"State: ";
351 template<
typename Val>
353 impl_->loadSaveState();
357 template<
typename Val>
358 typename NaiveBayesian<Val>::AttrIdd
360 NaiveBayesianClasify* classify =
new NaiveBayesianClasify(*
this, *impl_.get());
361 impl_.reset( classify );
366 template<
typename Val>
367 typename NaiveBayesian<Val>::Beliefs
369 NaiveBayesianClasify* classify =
new NaiveBayesianClasify(*
this, *impl_.get());
370 impl_.reset( classify );
375 template<
typename Val>
382 template<
typename Val>
384 NaiveBayesianClasify* classify =
new NaiveBayesianClasify(*
this, *impl_.get());
385 impl_.reset( classify );
389 template<
typename Val>
390 template<
class Archive>
392 ar.template register_type<NaiveBayesianClasify>();
393 ar << boost::serialization::make_nvp(
"NBCBase", boost::serialization::base_object<
Classifier<Val> >(*
this) );
395 const NaiveBayesianTraining*
const t = impl_.get();
396 ar << boost::serialization::make_nvp(
"NBCImpl",t);
399 template<
typename Val>
400 template<
class Archive>
402 ar.template register_type<NaiveBayesianClasify>();
403 ar >> boost::serialization::make_nvp(
"NBCBase", boost::serialization::base_object<
Classifier<Val> >(*
this) );
404 NaiveBayesianTraining* t;
405 ar >> boost::serialization::make_nvp(
"NBCImpl",t);
415 template<
class Categories>
418 typedef typename Categories::Classifier Cl;
419 typedef typename Cl::AttrIdd AttrIdd;
420 typedef typename Cl::AttrDomain AttrDomain;
421 typedef typename Cl::Domains Domains;
422 typedef typename Cl::ExamplesTrain ExamplesTrain;
426 const Categories& categories_;
429 : os_(os), catVal_(cat_val), categories_(categories) {
431 void operator()(
const AttrDomain& attr) {
432 for(
typename AttrDomain::const_iterator ii = attr.begin(); ii != attr.end(); ++ii ) {
433 AttrIdd val = &(*ii);
435 os_ << val->get() <<
"(" << categories_.getCategoryValCounter(catVal_, val) <<
"),";
444 template<
class Categories>
447 typedef typename Categories::Classifier Cl;
448 typedef typename Cl::AttrIdd AttrIdd;
449 typedef typename Cl::AttrDomain AttrDomain;
450 typedef typename Cl::Domains Domains;
451 typedef typename Cl::ExamplesTrain ExamplesTrain;
452 typedef typename Cl::Value Value;
455 const Domains& attributes_;
456 const Categories& categories_;
459 : os_(os), attributes_(attrib), categories_(categories) {
463 void operator()(
const Value& cat_val) {
464 os_ << cat_val <<
"(" << categories_.getCategoryCounter(&cat_val) <<
"):" << std::endl;
466 std::for_each( attributes_.begin(), attributes_.end(), printCountAttr );
477 template<
typename Val>
479 AttrIdd cat_val = example.getFeature();
480 typename CategoryCounters::iterator ii = counters_.find(cat_val);
481 if( ii != counters_.end() )
482 ++(*ii).second.data_;
484 ii = counters_.insert(
typename CategoryCounters::value_type(cat_val,1) ).first;
485 assert( ii != counters_.end() );
486 SimpleCounters& count = (*ii).second.attrData_;
487 for(
typename ExampleTrain::const_iterator i = example.begin(); i != example.end(); ++i ) {
489 typename SimpleCounters::iterator iii = count.find(value);
490 if( iii != count.end() )
493 count.insert( std::pair<AttrIdd,int>(value,1) );
498 template<
typename Val>
500 os <<
"TRAINING:" << std::endl;
502 std::for_each(parent_->getCategoryDomain().begin(), parent_->getCategoryDomain().end(), printCounters );
507 template<
typename Val>
509 typename CategoryCounters::const_iterator ii = counters_.find(cat_val);
510 if( ii != counters_.end() )
511 return (*ii).second.data_;
517 template<
typename Val>
519 typename CategoryCounters::const_iterator ii = counters_.find(cat_val);
520 if( ii == counters_.end() )
522 const SimpleCounters& count = (*ii).second.attrData_;
524 typename SimpleCounters::const_iterator jj = count.find(value);
525 if( jj != count.end() )
536 template<
typename Val>
537 typename NaiveBayesian<Val>::AttrIdd
539 if( probabl_.empty() )
540 return AttrDomain::getUnknownId();
542 AttrIdd cat_val_max = probabl_.begin()->first;
543 Probability max_prob = -std::numeric_limits<Probability>::max();
545 for(
typename InternalProbabilities::const_iterator ii = probabl_.begin(); ii != probabl_.end(); ++ii ) {
546 AttrIdd cat_val = (*ii).first;
547 Probability prob = calcProbabilityForExample(example, cat_val);
548 if( prob > max_prob ) {
550 cat_val_max = cat_val;
557 template<
typename Val>
558 typename NaiveBayesian<Val>::Beliefs
560 Probability sum = 0.0;
561 for(
typename InternalProbabilities::const_iterator ii = probabl_.begin(); ii != probabl_.end(); ++ii ) {
562 sum += std::exp( calcProbabilityForExample(example, (*ii).first ) );
566 for(
typename InternalProbabilities::const_iterator ii = probabl_.begin(); ii != probabl_.end(); ++ii ) {
567 AttrIdd cat_val = (*ii).first;
568 Probability prob = exp( calcProbabilityForExample(example, cat_val) ) / sum;
569 toRet.push_back(
typename Beliefs::value_type(cat_val, prob));
571 std::sort( toRet.begin(), toRet.end() );
576 template<
typename Val>
578 os <<
"CLASIFY:" << std::endl;
580 std::for_each(this->parent_->getCategoryDomain().begin(), this->parent_->getCategoryDomain().end(), printCounters );
584 template<
typename Val>
586 return std::exp( getCategoryCounterLog(cat_val) );
590 template<
typename Val>
592 typename InternalProbabilities::const_iterator ii = probabl_.find(cat_val);
593 if( ii != probabl_.end() )
594 return (*ii).second.data_;
600 template<
typename Val>
602 return exp( getCategoryValCounterLog(cat_val,value) );
606 template<
typename Val>
608 typename InternalProbabilities::const_iterator ii = probabl_.find(cat_val);
609 if( ii == probabl_.end() )
612 const Counters& counters = (*ii).second.attrData_;
614 typename Counters::const_iterator jj = counters.find(value);
615 if( jj != counters.end() )
623 Probability calcProbability(
int val_count,
int count_all,
int val_size) {
624 return std::log((val_count + 1)/ (Probability)(count_all + val_size ));
631 template<
typename Val>
637 for(
typename AttrDomain::const_iterator ii = category.begin(); ii != category.end(); ++ii ) {
638 AttrIdd catVal = AttrDomain::getValueId(ii);
639 sumTraining += nb_train.getCategoryCounter(catVal);
643 int catSize = category.getSize();
644 for(
typename AttrDomain::const_iterator ii = category.begin(); ii != category.end(); ++ii ) {
645 AttrIdd catVal = AttrDomain::getValueId(ii);
646 int catCounter = nb_train.getCategoryCounter(catVal);
647 Probability catProb = calcProbability( catCounter, catSize, sumTraining );
649 const Domains& attribs = this->parent_->getAttrDomains();
650 for(
typename Domains::const_iterator jj = attribs.begin(); jj != attribs.end(); ++jj) {
651 const AttrDomain& attr = *jj;
652 int valSize = attr.getSize();
653 for(
typename AttrDomain::const_iterator kk = attr.begin(); kk != attr.end(); ++kk) {
654 AttrIdd val = AttrDomain::getValueId(kk);
655 Probability valProb = calcProbability( nb_train.getCategoryValCounter(catVal, val), valSize, catCounter );
656 counters.insert(
typename Counters::value_type(val, valProb) );
659 probabl_.insert(
typename InternalProbabilities::value_type(catVal, CategoryData<Probability>(catProb, counters) ) );
664 template<
typename Val>
666 Probability prob = getCategoryCounterLog(cat_val);
667 for(
typename ExampleTest::const_iterator ii = example.begin(); ii != example.end(); ++ii )
668 prob += getCategoryValCounterLog(cat_val, *ii );
673 template<
typename Val>
674 template<
class Archive>
676 ar << boost::serialization::make_nvp(
"Base", boost::serialization::base_object<NaiveBayesianTraining>(*
this));
677 ar << boost::serialization::make_nvp(
"InternalProb",probabl_);
681 template<
typename Val>
682 template<
class Archive>
684 ar >> boost::serialization::make_nvp(
"Base", boost::serialization::base_object<NaiveBayesianTraining>(*
this));
685 typedef std::map<AttrIddSerialize, CategoryData<Probability> > Map;
687 ar >> boost::serialization::make_nvp(
"InternalProb",m);
689 for(
typename Map::const_iterator ii = m.begin(); ii != m.end(); ++ii) {
690 probabl_.insert(
typename InternalProbabilities::value_type(ii->first, ii->second) );
697 #endif //FAIF_NAIVE_BAYESIAN_HPP_ Val::DomainType::ValueId AttrIdd
attribute id representation in learning
Definition: Classifier.hpp:55
void switchAddTraining(const ExampleTrain &example)
Definition: NaiveBayesian.hpp:376
virtual Beliefs getCategories(const ExampleTest &) const
classify and return all classes, each with the belief that the example belongs to that class
Definition: NaiveBayesian.hpp:331
AttrIdd switchGetCategory(const ExampleTest &example)
Definition: NaiveBayesian.hpp:359
Val::Value AttrValue
attribute value representation in learning
Definition: Classifier.hpp:49
void trainIncremental(const ExampleTrain &)
Definition: NaiveBayesian.hpp:337
virtual void train(const ExamplesTrain &e)
learn classifier (on the collection of training examples)
Definition: NaiveBayesian.hpp:53
Naive Bayesian Classifier.
Definition: NaiveBayesian.hpp:33
class to show the classifier state, print the attribs and counters
Definition: NaiveBayesian.hpp:416
virtual void reset()
Definition: NaiveBayesian.hpp:317
friend class boost::serialization::access
serialization using boost::serialization
Definition: NaiveBayesian.hpp:82
virtual void write(std::ostream &os) const
Definition: Classifier.hpp:175
point and some feature
Definition: Point.hpp:58
const AttrDomain & getCategoryDomain() const
accessor
Definition: Classifier.hpp:112
Belief< Val >::Beliefs Beliefs
collection of pairs (AttrIdd, Probability)
Definition: Classifier.hpp:64
inner class - examples train collection
Definition: Classifier.hpp:82
void switchLoadSaveState()
Definition: NaiveBayesian.hpp:383
virtual void write(std::ostream &os) const
Definition: NaiveBayesian.hpp:343
Point in n-space, each component of the same type.
Definition: Point.hpp:22
Val::DomainType::ValueIdSerialize AttrIddSerialize
used for serialization, where the const interferes
Definition: Classifier.hpp:58
virtual AttrIdd getCategory(const ExampleTest &) const
Definition: NaiveBayesian.hpp:324
Beliefs switchGetCategories(const ExampleTest &example)
Definition: NaiveBayesian.hpp:368
Val::DomainType AttrDomain
the attribute domain for learning
Definition: Classifier.hpp:52
the classifier interface
Definition: Classifier.hpp:43
print the counters for the given category
Definition: NaiveBayesian.hpp:445