X-Boost  2.3.8
DecisionTree.h
1 /* XBoost: Ada-Boost and Friends on Haar/ICF/HOG Features, Library and ToolBox
2  *
3  * Copyright (c) 2008-2014 Paolo Medici <medici@ce.unipr.it>
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  * Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public
16  * License along with this library; if not, write to the
17  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18  * Boston, MA 02111-1307, USA.
19  */
20 
21 #ifndef _ORACLE_DECISION_TREE_H
22 #define _ORACLE_DECISION_TREE_H
23 
#include <algorithm>   // std::sort
#include <cmath>       // std::abs
#include <cstring>
#include <iostream>    // verbose diagnostics
#include <utility>     // std::pair
#include <vector>

#include "Thread/thread_group.h"
#include "Thread/bind.h"
#include "detail/BinaryClassifierOptimize.h"
34 
36 template<class DataType>
37 double BuildDecisionTree(IndirectDecisionTree<DataType> &root, const PrecomputedPatternResponse<DataType> & src, int max_depth, const std::vector<int> & indexes, int max_concurrent_jobs)
38 {
39  double bestw = 0.0;
40 
41  const int n_samples = indexes.size();
42  // precompute Wn, Wp
43  double wp = 0.0, wn = 0.0;
44  for(int i=0; i<n_samples; ++i)
45  {
46  int k = indexes[i];
47  if(src.category[k] == 1)
48  wp += src.weights[k];
49  else
50  wn -= src.weights[k];
51  }
52 
53 #ifdef verbose
54  std::cout << max_depth << "-- build decision tree with " << n_samples << " elements. wp:" << wp << ", wn:" << wn << ", wp+wn:" << wp+wn << std::endl;
55 #endif
56 
57  if(max_depth == 0)
58  {
59  if(wp>wn)
60  {
61 #ifdef verbose
62  std::cout << "\tmax depth reached: positive with " << wp << "\n";
63 #endif
64  root.category = 1;
65  return wp;
66  }
67  else
68  {
69 #ifdef verbose
70  std::cout << "\tmax depth reached: negative with " << wn << "\n";
71 #endif
72  root.category = -1;
73  return wn;
74  }
75  }
76 
77  // TODO: split using threads
78 
79  // temporary list used in optimizer
80  std::pair<DataType, double> * store = new std::pair<DataType, double> [n_samples];
81 
83  for(int i = 0; i<src.n_features; ++i)
84  {
85  // create Pattern Response
86  for(int j=0; j< n_samples; ++j)
87  {
88  int k = indexes[j];
89  store[j].first = src.response[i * src.n_samples + k];
90  store[j].second = src.weights[k]; /* (double) src.category[k] * src.weights[k]; */
91  }
92 
93  // sort
94  std::sort(&store[0], &store[n_samples]);
95 
97 
98  // evaluate feature: th and direction
99  std::pair<DataType, int> ret;
100 
101  Optimize(ret, store, n_samples, wp, wn, false);
102  // ret contain the best solution (threshold and diretion). Direction is not so important due to nature of decision tree
103 
104  // evaluate threshold
105  int left_size = 0;
106  double w_left = 0.0;
107 // double ewp = 0.0;
108 
109  for(int j=0; j< n_samples; ++j)
110  {
111 
112  int k = indexes[j];
113  // CORRECT SIDE
114  if(src.response[i * src.n_samples + k] < ret.first)
115  {
116  left_size++;
117  w_left += std::abs(src.weights[k]);
118  }
119  }
120 
121 #ifdef verbose
122  std::cout << "\t" << i << " | th: " << ret.first << ", polarity:" << ret.second << /*", w+:" << ewp << */ ", w:" << w << ", left:" << left_size << "(" << w_left << "), right:" << n_samples - left_size << "(" << wn+wp-w_left <<")"<< std::endl;
123 #endif
124 
125  if((left_size == 0) || (left_size == n_samples ))
126  {
127  // unusefull solution
128 #ifdef verbose
129  std::cout << "\t invalid solution" << std::endl;
130 #endif
131  }
132  else
133  {
134 
135  // BUILD PRECOMPUTE LEAVES
136  std::vector<int> left_resp, right_resp;
137 
138  left_resp.reserve(left_size);
139  right_resp.reserve(n_samples - left_size);
140 
141  // create leaves:
142  for(int j=0; j< n_samples; ++j)
143  {
144  int k = indexes[j];
145  if(src.response[i * src.n_samples + k] < ret.first)
146  {
147  left_resp.push_back(k);
148  }
149  else
150  {
151  right_resp.push_back(k);
152  }
153  }
154 
155  double w;
156 
159 
160 #ifdef verbose
161  std::cout << max_depth << "-- left --\n";
162 #endif
163  w = BuildDecisionTree(*left, src, max_depth-1, left_resp, 1);
164 #ifdef verbose
165  std::cout << max_depth << "-- right --\n";
166 #endif
167  w += BuildDecisionTree(*right, src, max_depth-1, right_resp, 1);
168 
169 #ifdef verbose
170  std::cout << max_depth << "-- w:" << w << std::endl;
171 #endif
172 
173  if(w > bestw)
174  {
175  delete root.left;
176  delete root.right;
177  root.left = left;
178  root.right = right;
179  root.th = ret.first;
180  root.classifier = i;
181  bestw = w;
182  }
183  else
184  {
185  // free unused memory
186  delete left;
187  delete right;
188  }
189 
190  }
191 
192  }
193 
194  delete [] store;
195 
196  if(bestw==0.0)
197  {
198 #ifdef verbose
199  std::cout << "no solution found:";
200 #endif
201  if(wp>wn)
202  {
203 #ifdef verbose
204  std::cout << "\tpositive with w:" << wp <<"\n";
205 #endif
206  root.category = 1;
207  return wp;
208  }
209  else
210  {
211 #ifdef verbose
212  std::cout << "\tnegative with w:" << wn <<"\n";
213 #endif
214  root.category = -1;
215  return wn;
216  }
217 
218 
219  }
220 
221 #ifdef verbose
222  std::cout << "solution found with w:" << bestw << std::endl;
223 #endif
224 
225  return bestw;
226 }
227 
228 namespace detail {
229 
230 template<class DataType>
231 void build_decision_tree_heuristic(double *outw, IndirectDecisionTree<DataType> * root, const PrecomputedPatternResponse<DataType> * src, int f0, int f1, int max_depth, const int * indexes, int n_samples, int histo_bin_size, float wp, float wn, bool optimize)
232 {
233  double bestw = 0.0;
234  // temporary list used in optimizer
235  float *store = new float[histo_bin_size];
236 
237  for(int i = f0; i< f1; ++i)
238  {
239  // reset and create histogram
240 
242 
243  for(int j = 0; j<histo_bin_size; ++j)
244  {
245  store[j] = 0.0;
246  }
247 
248  // create Pattern Response histogram
249  for(int j=0; j<n_samples; ++j)
250  {
251  int k = indexes[j];
252  // DataType v = src->GetResponse(k)[i];
253  int bin = src->bin[i * src->n_samples + k];
254  store[bin] += src->weights[k];
255  }
257 
258  // evaluate feature: th and direction
259  std::pair<DataType, int> ret;
260 
261  SimpleOptimize(ret, store, histo_bin_size, wp, wn);
262  // ret contain the best solution (threshold and diretion). Direction is not so important due to nature of decision tree
263 
264  // evaluate threshold
265  int left_size = 0;
266 
267  // SPLIT: precompute vector size in order to reserve memory
268  for(int j=0; j<n_samples; ++j)
269  {
270 
271  int k = indexes[j];
272  // CORRECT SIDE
273  if(src->bin[i * src->n_samples + k] < ret.first)
274  {
275  left_size++;
276 #ifdef verbose
277  w_left += std::abs(src.weights[k]);
278 #endif
279  }
280  }
281 
282  if((left_size == 0) || (left_size == n_samples ))
283  {
284  // unusefull solution
285  }
286  else
287  {
288 
290 
291  // BUILD PRECOMPUTE LEAVES
292  std::vector<int> left_resp, right_resp;
293 
294  left_resp.reserve(left_size);
295  right_resp.reserve(n_samples - left_size);
296 
297  // create leaves:
298  for(int j=0; j<n_samples; ++j)
299  {
300  int k = indexes[j];
301  if(src->bin[i * src->n_samples + k] < ret.first)
302  {
303  left_resp.push_back(k);
304  }
305  else
306  {
307  right_resp.push_back(k);
308  }
309  }
310 
312 
313  double w;
314 
317 
318  w = BuildDecisionTreeHeuristic(*left, *src, max_depth-1, left_resp, 1, histo_bin_size, optimize);
319  w += BuildDecisionTreeHeuristic(*right, *src, max_depth-1, right_resp, 1, histo_bin_size, optimize);
320 
321  if(w > bestw)
322  {
323  delete root->left;
324  delete root->right;
325  root->left = left;
326  root->right = right;
327 
329  DataType minr, maxr;
330 
331  minr = src->response_range[i].first;
332  maxr = src->response_range[i].second;
333 
334  DataType delta = (maxr > minr) ? (maxr - minr + 1) : (1);
335 
336  root->th = minr + ((ret.first * delta) / histo_bin_size); // ret.first;
337 
339 
340  root->classifier = i;
341  bestw = w;
342  }
343  else
344  {
345  // free unused memory
346  delete left;
347  delete right;
348  }
349 
350  }
351 
352  }
353 
354  delete [] store;
355 
356  *outw= bestw;
357 }
358 
359 }
360 
362 template<class DataType>
363 double BuildDecisionTreeHeuristic(IndirectDecisionTree<DataType> &root, const PrecomputedPatternResponse<DataType> & src, int max_depth, const std::vector<int> & indexes, int max_concurrent_jobs, int histo_bin_size, bool reoptimize)
364 {
365  double bestw = 0.0;
366 
367  const int n_samples = indexes.size();
368 
369  // precompute Wn, Wp
370  float wp = 0.0, wn = 0.0;
371  for(int i=0; i<n_samples; ++i)
372  {
373  int k = indexes[i];
374  if(src.category[k] == 1)
375  wp += src.weights[k];
376  else
377  wn -= src.weights[k];
378  }
379 
380 #ifdef verbose
381  std::cout << max_depth << "-- build decision tree with " << n_samples << " elements. wp:" << wp << ", wn:" << wn << ", wp+wn:" << wp+wn << std::endl;
382 #endif
383 
384  if(max_depth == 0)
385  {
386  if(wp>wn)
387  {
388 #ifdef verbose
389  std::cout << "\tmax depth reached: positive with " << wp << "\n";
390 #endif
391  root.category = 1;
392  return wp;
393  }
394  else
395  {
396 #ifdef verbose
397  std::cout << "\tmax depth reached: negative with " << wn << "\n";
398 #endif
399  root.category = -1;
400  return wn;
401  }
402  }
403 
404  // split using threads
405 
406  if(max_concurrent_jobs>1)
407  {
409 
410  IndirectDecisionTree<DataType> *candidates = new IndirectDecisionTree<DataType>[max_concurrent_jobs];
411  double *outw = new double[max_concurrent_jobs];
412 
413  for(int i =0; i<max_concurrent_jobs; ++i)
414  {
415  int f0 = (src.n_features * i) / max_concurrent_jobs;
416  int f1 = (src.n_features * (i+1) ) / max_concurrent_jobs;
417 
418  pool.create_thread(sprint::thread_bind(&detail::build_decision_tree_heuristic<DataType> , &outw[i], &candidates[i], &src, f0, f1, max_depth, &indexes[0], n_samples, histo_bin_size, wp, wn, reoptimize) );
419  }
420 
421  pool.join_all();
422 
423  for(int i=0; i<max_concurrent_jobs; ++i)
424  {
425  if(outw[i] > bestw)
426  {
427  bestw = outw[i];
428  root = candidates[i];
429  }
430  }
431 
432  delete [] outw;
433  delete [] candidates;
434  }
435  else
436  {
437  detail::build_decision_tree_heuristic(&bestw, &root, &src, 0, src.n_features, max_depth, &indexes[0], n_samples, histo_bin_size, wp, wn, reoptimize);
438  }
439 
440 
442 
443  if(bestw==0.0)
444  {
445 #ifdef verbose
446  std::cout << "no solution found:";
447 #endif
448  if(wp>wn)
449  {
450 #ifdef verbose
451  std::cout << "\tpositive with w:" << wp <<"\n";
452 #endif
453  root.category = 1;
454  return wp;
455  }
456  else
457  {
458 #ifdef verbose
459  std::cout << "\tnegative with w:" << wn <<"\n";
460 #endif
461  root.category = -1;
462  return wn;
463  }
464 
465 
466  }
467 
468 #ifdef verbose
469  std::cout << "solution found with w:" << bestw << std::endl;
470 #endif
471 
472  return bestw;
473 }
474 
// forward declaration: the detail worker below recurses through this entry point
template<class DataType>
double BuildDecisionTreeHeuristic_v2(IndirectDecisionTree<DataType> &root, const PrecomputedPatternResponse<DataType> & src, int max_depth, const std::vector<int> & indexes, int max_concurrent_jobs, int histo_bin_size, bool reoptimize);
478 
479 
480 namespace detail {
481 
482 template<class DataType>
483 void build_decision_tree_heuristic_v2(double *outw, IndirectDecisionTree<DataType> * root, const PrecomputedPatternResponse<DataType> * src, int f0, int f1, int max_depth, const int * indexes, int n_samples, int histo_bin_size, float wp, float wn, bool reoptimize)
484 {
485  double bestw = 0.0;
486  std::pair<DataType, int> best_ret;
487  int best_i = -1;
488  // temporary list used in optimizer
489  float *store = new float[histo_bin_size];
490 
491  // for each features
492  for(int i = f0; i< f1; ++i)
493  {
494  // reset and create histogram
495 
497 
498  // reset store:
499  for(int j = 0; j<histo_bin_size; ++j)
500  {
501  store[j] = 0.0f;
502  }
503 
504  // create Pattern Response histogram
505  for(int j=0; j<n_samples; ++j)
506  {
507  int k = indexes[j];
508  // DataType v = src->GetResponse(k)[i];
509  int bin = src->bin[i * src->n_samples + k];
510  store[bin] += src->weights[k];
511  }
513 
514  // evaluate feature: th and direction
515  std::pair<DataType, int> ret;
516 
517  double w = SimpleOptimize(ret, store, histo_bin_size, wp, wn);
518  // ret contain the best solution (threshold and diretion). Direction is not so important due to nature of decision tree
519 
520  if(w > bestw)
521  {
522  bestw = w;
523  best_ret = ret;
524  best_i = i;
525  }
526 
527  }
528  delete [] store;
529 
530  root->classifier = best_i;
531 
532  if(reoptimize)
533  {
534  // REOPTIMIZE
535  std::pair<DataType, float> * store = new std::pair<DataType, float> [n_samples];
536 
538 
539  // create Pattern Response
540  for(int j=0; j< n_samples; ++j)
541  {
542  int k = indexes[j];
543  store[j].first = src->response[best_i * src->n_samples + k];
544  store[j].second = src->weights[k]; /* (double) src.category[k] * src.weights[k]; */
545  }
546 
547  // sort
548  std::sort(&store[0], &store[n_samples]);
549 
551 
552  // evaluate feature: th and direction
553  Optimize(best_ret, store, n_samples, wp, wn, false);
554 
555  root->th = best_ret.first;
557 
558  delete [] store;
559 
560  }
561  else
562  {
564  DataType minr, maxr;
565 
566  minr = src->response_range[best_i].first;
567  maxr = src->response_range[best_i].second;
568 
569  DataType delta = (maxr > minr) ? (maxr - minr + 1) : (1);
570 
571  root->th = minr + ((best_ret.first * delta) / histo_bin_size); // ret.first;
573  }
574 
575 
576  // evaluate threshold
577  int left_size = 0;
578 
579  // SPLIT: precompute vector size in order to reserve memory
580  for(int j=0; j<n_samples; ++j)
581  {
582 
583  int k = indexes[j];
584  // CORRECT SIDE
585  if(src->response[best_i * src->n_samples + k] < root->th)
586  {
587  left_size++;
588 #ifdef verbose
589  w_left += std::abs(src.weights[k]);
590 #endif
591  }
592  }
593 
594  if((left_size == 0) || (left_size == n_samples ))
595  {
596  // unusefull solution
597  *outw = 0.0;
598  }
599  else
600  {
601 
603 
604  // BUILD PRECOMPUTE LEAVES
605  std::vector<int> left_resp, right_resp;
606 
607  left_resp.reserve(left_size);
608  right_resp.reserve(n_samples - left_size);
609 
610  // create leaves:
611  for(int j=0; j<n_samples; ++j)
612  {
613  int k = indexes[j];
614  if(src->response[best_i * src->n_samples + k] < root->th)
615  {
616  left_resp.push_back(k);
617  }
618  else
619  {
620  right_resp.push_back(k);
621  }
622  }
623 
625 
628 
629  double w;
630  w = BuildDecisionTreeHeuristic_v2(*left, *src, max_depth-1, left_resp, 1, histo_bin_size, reoptimize);
631  w += BuildDecisionTreeHeuristic_v2(*right, *src, max_depth-1, right_resp, 1, histo_bin_size, reoptimize);
632 
633  root->left = left;
634  root->right = right;
635 
636  *outw= w;
637  }
638 
639 }
640 
641 }
642 
643 
646 template<class DataType>
647 double BuildDecisionTreeHeuristic_v2(IndirectDecisionTree<DataType> &root, const PrecomputedPatternResponse<DataType> & src, int max_depth, const std::vector<int> & indexes, int max_concurrent_jobs, int histo_bin_size, bool reoptimize)
648 {
649  double bestw = 0.0;
650 
651  const int n_samples = indexes.size();
652 
653  // precompute Wn, Wp
654  float wp = 0.0, wn = 0.0;
655  for(int i=0; i<n_samples; ++i)
656  {
657  int k = indexes[i];
658  if(src.category[k] == 1)
659  wp += src.weights[k];
660  else
661  wn -= src.weights[k];
662  }
663 
664  if(max_depth == 0)
665  {
666  if(wp>wn)
667  {
668  root.category = 1;
669  return wp;
670  }
671  else
672  {
673  root.category = -1;
674  return wn;
675  }
676  }
677 
678  // split using threads
679 
680  if(max_concurrent_jobs>1)
681  {
683 
684  IndirectDecisionTree<DataType> *candidates = new IndirectDecisionTree<DataType>[max_concurrent_jobs];
685  double *outw = new double[max_concurrent_jobs];
686 
687  for(int i =0; i<max_concurrent_jobs; ++i)
688  {
689  int f0 = (src.n_features * i) / max_concurrent_jobs;
690  int f1 = (src.n_features * (i+1) ) / max_concurrent_jobs;
691 
692  pool.create_thread(sprint::thread_bind(&detail::build_decision_tree_heuristic_v2<DataType> , &outw[i], &candidates[i], &src, f0, f1, max_depth, &indexes[0], n_samples, histo_bin_size, wp, wn, reoptimize) );
693  }
694 
695  pool.join_all();
696 
697  for(int i=0; i<max_concurrent_jobs; ++i)
698  {
699  if(outw[i] > bestw)
700  {
701  bestw = outw[i];
702  root = candidates[i];
703  }
704  }
705 
706  delete [] outw;
707  delete [] candidates;
708  }
709  else
710  {
711  detail::build_decision_tree_heuristic_v2(&bestw, &root, &src, 0, src.n_features, max_depth, &indexes[0], n_samples, histo_bin_size, wp, wn, reoptimize);
712  }
713 
714 
716 
717  if(bestw==0.0)
718  {
719  if(wp>wn)
720  {
721  root.category = 1;
722  return wp;
723  }
724  else
725  {
726  root.category = -1;
727  return wn;
728  }
729 
730 
731  }
732 
733  return bestw;
734 }
735 
736 #endif
int * category
category list
Definition: PrecomputedPatternResponse.h:48
IndirectDecisionTree< DataType > * left
sub-nodes (if null is a leaf)
Definition: IndirectDecisionTree.h:45
this header contains PrecomputedPatternResponse class, used to store hash information provided by a m...
double * weights
weights list
Definition: PrecomputedPatternResponse.h:50
int category
if not a node, a category
Definition: IndirectDecisionTree.h:42
bool BuildDecisionTree(DecisionTree< FeatureExtractor > &root, const std::vector< PatternType > &patterns, Param1 param, FeatureGenerator &f, int max_depth)
Definition: DecisionTreeLearner.h:172
int * bin
precomputed bin matrix (rows features, columns samples)
Definition: PrecomputedPatternResponse.h:56
Definition: thread_group.h:82
void join_all()
wait all threads terminate
Definition: thread_group.h:114
proposal 1 for thread group
int n_features
number of feature preallocated
Definition: PrecomputedPatternResponse.h:46
int classifier
index to a Binary Classifier
Definition: IndirectDecisionTree.h:37
int n_samples
number of samples
Definition: PrecomputedPatternResponse.h:44
Definition: PrecomputedPatternResponse.h:41
method to create function pointer for thread call
std::pair< ResponseType, ResponseType > * response_range
precomputed response ranges
Definition: PrecomputedPatternResponse.h:54
bool create_thread(const sprint::thread_function &p)
create an additional thread
Definition: thread_group.h:102
ResponseType * response
response matrix. For performance reason it each row is a feature and columns are samples ...
Definition: PrecomputedPatternResponse.h:52
implement a decision tree with feature stored in a map
DataType th
Classifier Threshold.
Definition: IndirectDecisionTree.h:39
Definition: IndirectDecisionTree.h:33