mlpack
r_star_tree_split_impl.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_CORE_TREE_RECTANGLE_TREE_R_STAR_TREE_SPLIT_IMPL_HPP
13 #define MLPACK_CORE_TREE_RECTANGLE_TREE_R_STAR_TREE_SPLIT_IMPL_HPP
14 
15 #include "r_star_tree_split.hpp"
16 #include "rectangle_tree.hpp"
17 
19 
20 namespace mlpack {
21 namespace tree {
22 
27 template<typename TreeType>
28 size_t RStarTreeSplit::ReinsertPoints(TreeType* tree,
29  std::vector<bool>& relevels)
30 {
31  // Convenience typedef.
32  typedef typename TreeType::ElemType ElemType;
33 
34  // Check if we need to reinsert.
35  if (relevels[tree->TreeDepth() - 1])
36  {
37  relevels[tree->TreeDepth() - 1] = false;
38 
39  // We sort the points by decreasing distance to the centroid of the bound.
40  // We then remove the first p entries and reinsert them at the root.
41  TreeType* root = tree;
42  while (root->Parent() != NULL)
43  root = root->Parent();
44  size_t p = tree->MaxLeafSize() * 0.3; // The paper says this works the best.
45  if (p > 0)
46  {
47  // We'll only do reinsertions if p > 0. p is the number of points we will
48  // reinsert. If p == 0, then no reinsertion is necessary and we continue
49  // with the splitting procedure.
50  std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
51  arma::Col<ElemType> center;
52  tree->Bound().Center(center);
53  for (size_t i = 0; i < sorted.size(); ++i)
54  {
55  sorted[i].first = tree->Metric().Evaluate(center,
56  tree->Dataset().col(tree->Point(i)));
57  sorted[i].second = tree->Point(i);
58  }
59 
60  std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
61 
62  // Remove the points furthest from the center of the node.
63  for (size_t i = 0; i < p; ++i)
64  root->DeletePoint(sorted[sorted.size() - 1 - i].second, relevels);
65 
66  // Now reinsert the points, but reverse the order---insert the closest to
67  // the center first.
68  for (size_t i = p; i > 0; --i)
69  root->InsertPoint(sorted[sorted.size() - i].second, relevels);
70  }
71 
72  return p;
73  }
74 
75  return 0;
76 }
77 
81 template<typename TreeType>
82 void RStarTreeSplit::PickLeafSplit(TreeType* tree,
83  size_t& bestAxis,
84  size_t& bestIndex)
85 {
86  // Convenience typedef.
87  typedef typename TreeType::ElemType ElemType;
89 
90  bestAxis = 0;
91  bestIndex = 0;
92  ElemType bestScore = std::numeric_limits<ElemType>::max();
93 
97  for (size_t j = 0; j < tree->Bound().Dim(); ++j)
98  {
99  ElemType axisScore = 0.0;
100 
101  // Sort in increasing values of the selected dimension j.
102  arma::Col<ElemType> dimValues(tree->Count());
103  for (size_t i = 0; i < tree->Count(); ++i)
104  dimValues[i] = tree->Dataset().col(tree->Point(i))[j];
105  arma::uvec sortedIndices = arma::sort_index(dimValues);
106 
107  // We'll store each of the three scores for each distribution.
108  const size_t numPossibleSplits = tree->MaxLeafSize() -
109  2 * tree->MinLeafSize() + 2;
110  arma::Col<ElemType> areas(numPossibleSplits, arma::fill::zeros);
111  arma::Col<ElemType> margins(numPossibleSplits, arma::fill::zeros);
112  arma::Col<ElemType> overlaps(numPossibleSplits, arma::fill::zeros);
113 
114  for (size_t i = 0; i < numPossibleSplits; ++i)
115  {
116  // The ith arrangement is obtained by placing the first
117  // tree->MinLeafSize() + i points in one rectangle and the rest in
118  // another. Then we calculate the three scores for that distribution.
119  size_t splitIndex = tree->MinLeafSize() + i;
120 
121  BoundType bound1(tree->Bound().Dim());
122  BoundType bound2(tree->Bound().Dim());
123 
124  for (size_t l = 0; l < splitIndex; l++)
125  bound1 |= tree->Dataset().col(tree->Point(sortedIndices[l]));
126 
127  for (size_t l = splitIndex; l < tree->Count(); l++)
128  bound2 |= tree->Dataset().col(tree->Point(sortedIndices[l]));
129 
130  areas[i] = bound1.Volume() + bound2.Volume();
131  overlaps[i] = bound1.Overlap(bound2);
132 
133  for (size_t k = 0; k < bound1.Dim(); ++k)
134  margins[i] += bound1[k].Width() + bound2[k].Width();
135 
136  axisScore += margins[i];
137  }
138 
139  // Is this dimension a new best score? We want the lowest possible score.
140  if (axisScore < bestScore)
141  {
142  bestScore = axisScore;
143  bestAxis = j;
144  size_t overlapIndex = 0;
145  size_t areaIndex = 0;
146  bool tiedOnOverlap = false;
147 
148  for (size_t i = 1; i < areas.n_elem; ++i)
149  {
150  if (overlaps[i] < overlaps[overlapIndex])
151  {
152  tiedOnOverlap = false;
153  overlapIndex = i;
154  areaIndex = i;
155  }
156  else if (overlaps[i] == overlaps[overlapIndex])
157  {
158  tiedOnOverlap = true;
159  if (areas[i] < areas[areaIndex])
160  areaIndex = i;
161  }
162  }
163 
164  // Select the best index for splitting.
165  bestIndex = (tiedOnOverlap ? areaIndex : overlapIndex);
166  }
167  }
168 }
169 
176 template<typename TreeType>
177 void RStarTreeSplit::SplitLeafNode(TreeType *tree, std::vector<bool>& relevels)
178 {
179  // Convenience typedef.
180  typedef typename TreeType::ElemType ElemType;
181 
182  // If there's no need to split, don't.
183  if (tree->Count() <= tree->MaxLeafSize())
184  return;
185 
186  // If we haven't yet checked if we need to reinsert on this level, we try
187  // doing so now.
188  if (ReinsertPoints(tree, relevels) > 0)
189  return;
190 
191  // We don't need to reinsert. Instead, we need to split the node.
192  size_t bestAxis;
193  size_t bestIndex;
194  PickLeafSplit(tree, bestAxis, bestIndex);
195 
200  std::vector<std::pair<ElemType, size_t>> sorted(tree->Count());
201  for (size_t i = 0; i < sorted.size(); ++i)
202  {
203  sorted[i].first = tree->Dataset().col(tree->Point(i))[bestAxis];
204  sorted[i].second = tree->Point(i);
205  }
206 
207  std::sort(sorted.begin(), sorted.end(), PairComp<ElemType, size_t>);
208 
220  TreeType* par = tree->Parent();
221  TreeType* treeOne = (par) ? tree : new TreeType(tree);
222  TreeType* treeTwo = (par) ? new TreeType(par) : new TreeType(tree);
223 
224  // Now clean the node, and we will re-use this.
225  const size_t numPoints = tree->Count();
226 
227  // Reset the original node's values, regardless of whether it will become
228  // the new parent or not.
229  tree->numChildren = 0;
230  tree->numDescendants = 0;
231  tree->count = 0;
232  tree->bound.Clear();
233 
234  // Insert the points into the appropriate tree.
235  for (size_t i = 0; i < numPoints; ++i)
236  {
237  if (i < bestIndex + tree->MinLeafSize())
238  treeOne->InsertPoint(sorted[i].second);
239  else
240  treeTwo->InsertPoint(sorted[i].second);
241  }
242 
243  // Insert the new tree node(s).
244  if (par)
245  {
246  // Just insert the new node into the parent.
247  par->children[par->NumChildren()++] = treeTwo;
248 
249  // If we have overflowed the parent's children, then we need to split that
250  // node also.
251  if (par->NumChildren() == par->MaxNumChildren() + 1)
252  RStarTreeSplit::SplitNonLeafNode(par, relevels);
253  }
254  else
255  {
256  // Now insert the two nodes into 'tree', which is now a higher-level root
257  // node in the tree.
258  InsertNodeIntoTree(tree, treeOne);
259  InsertNodeIntoTree(tree, treeTwo);
260  }
261 }
262 
270 template<typename TreeType>
272  TreeType *tree,
273  std::vector<bool>& relevels)
274 {
275  // Convenience typedef.
276  typedef typename TreeType::ElemType ElemType;
278 
279  // Reinsertion isn't done for non-leaf nodes; the paper doesn't seem to make
280  // it clear how to reinsert an entire node without reinserting each of the
281  // points, so we will avoid that here. This is a possible area for
282  // improvement of this code.
283 
284  size_t bestAxis = 0;
285  size_t bestIndex = 0;
286  ElemType bestScore = std::numeric_limits<ElemType>::max();
287  bool lowIsBetter = true;
288 
292  for (size_t j = 0; j < tree->Bound().Dim(); ++j)
293  {
294  ElemType axisLoScore = 0.0;
295  ElemType axisHiScore = 0.0;
296 
297  // We have to calculate values for both the lower and higher parts of the
298  // bound.
299  arma::Col<ElemType> loDimValues(tree->NumChildren());
300  arma::Col<ElemType> hiDimValues(tree->NumChildren());
301  for (size_t i = 0; i < tree->NumChildren(); ++i)
302  {
303  loDimValues[i] = tree->Child(i).Bound()[j].Lo();
304  hiDimValues[i] = tree->Child(i).Bound()[j].Hi();
305  }
306  arma::uvec sortedLoDimIndices = arma::sort_index(loDimValues);
307  arma::uvec sortedHiDimIndices = arma::sort_index(hiDimValues);
308 
309  // We'll store each of the three scores for each distribution. Remember
310  // that these are the sums calculated over both the low and high bounds of
311  // each rectangle.
312  const size_t numPossibleSplits = tree->MaxNumChildren() -
313  2 * tree->MinNumChildren() + 2;
314  arma::Col<ElemType> areas(2 * numPossibleSplits, arma::fill::zeros);
315  arma::Col<ElemType> margins(2 * numPossibleSplits, arma::fill::zeros);
316  arma::Col<ElemType> overlaps(2 * numPossibleSplits, arma::fill::zeros);
317 
318  for (size_t i = 0; i < numPossibleSplits; ++i)
319  {
320  // The ith arrangement is obtained by placing the first
321  // tree->MinNumChildren() + i points in one rectangle and the rest in
322  // another. Then we calculate the three scores for that distribution.
323  const size_t splitIndex = tree->MinNumChildren() + i;
324 
325  BoundType lb1(tree->Bound().Dim());
326  BoundType lb2(tree->Bound().Dim());
327  BoundType hb1(tree->Bound().Dim());
328  BoundType hb2(tree->Bound().Dim());
329 
330  for (size_t l = 0; l < splitIndex; ++l)
331  {
332  lb1 |= tree->Child(sortedLoDimIndices[l]).Bound();
333  hb1 |= tree->Child(sortedHiDimIndices[l]).Bound();
334  }
335  for (size_t l = splitIndex; l < tree->NumChildren(); ++l)
336  {
337  lb2 |= tree->Child(sortedLoDimIndices[l]).Bound();
338  hb2 |= tree->Child(sortedHiDimIndices[l]).Bound();
339  }
340 
341  // Calculate low bound distributions.
342  areas[2 * i] = lb1.Volume() + lb2.Volume();
343  overlaps[2 * i] = lb1.Overlap(lb2);
344 
345  // Calculate high bound distributions.
346  areas[2 * i + 1] = hb1.Volume() + hb2.Volume();
347  overlaps[2 * i + 1] = hb1.Overlap(hb2);
348 
349  // Now calculate margins for each.
350  for (size_t k = 0; k < lb1.Dim(); ++k)
351  {
352  margins[2 * i] += lb1[k].Width() + lb2[k].Width();
353  margins[2 * i + 1] += hb1[k].Width() + hb2[k].Width();
354  }
355 
356  // The score we use is the sum of all scores.
357  axisLoScore += margins[2 * i];
358  axisHiScore += margins[2 * i + 1];
359  }
360 
361  // If this dimension's score (for lower or higher bound scores) is a new
362  // best, then extract the necessary split information.
363  if (std::min(axisLoScore, axisHiScore) < bestScore)
364  {
365  bestScore = std::min(axisLoScore, axisHiScore);
366  if (axisLoScore < axisHiScore)
367  lowIsBetter = true;
368  else
369  lowIsBetter = false;
370 
371  bestAxis = j;
372 
373  // This will get us either the lower or higher bound depending on which we
374  // want. Remember that we selected *either* the lower or higher bounds to
375  // split on, so we want to only check those.
376  const size_t indexOffset = lowIsBetter ? 0 : 1;
377 
378  size_t overlapIndex = indexOffset;
379  size_t areaIndex = indexOffset;
380  bool tiedOnOverlap = false;
381 
382  // Find the best possible split (and whether it is on the low values or
383  // high values of the bounds).
384  for (size_t i = 1; i < numPossibleSplits; ++i)
385  {
386  // Check bounds.
387  if (overlaps[2 * i + indexOffset] < overlaps[overlapIndex])
388  {
389  tiedOnOverlap = false;
390  areaIndex = 2 * i + indexOffset;
391  overlapIndex = 2 * i + indexOffset;
392  }
393  else if (overlaps[i] == overlaps[overlapIndex])
394  {
395  tiedOnOverlap = true;
396  if (areas[2 * i + indexOffset] < areas[areaIndex])
397  areaIndex = 2 * i + indexOffset;
398  }
399  }
400 
401  bestIndex = ((tiedOnOverlap ? areaIndex : overlapIndex) - indexOffset) / 2
402  + tree->MinNumChildren();
403  }
404  }
405 
406  // Get a list of the old children.
407  std::vector<TreeType*> oldChildren(tree->NumChildren());
408  for (size_t i = 0; i < oldChildren.size(); ++i)
409  oldChildren[i] = &tree->Child(i);
410 
422  TreeType* par = tree->Parent();
423  TreeType* treeOne = par ? tree : new TreeType(tree);
424  TreeType* treeTwo = par ? new TreeType(par) : new TreeType(tree);
425 
426  // Now clean the node.
427  tree->numChildren = 0;
428  tree->numDescendants = 0;
429  tree->count = 0;
430  tree->bound.Clear();
431 
432  // Assemble vector of values.
433  arma::Col<ElemType> values(oldChildren.size());
434  for (size_t i = 0; i < oldChildren.size(); ++i)
435  {
436  values[i] = (lowIsBetter ? oldChildren[i]->Bound()[bestAxis].Lo()
437  : oldChildren[i]->Bound()[bestAxis].Hi());
438  }
439  arma::uvec indices = arma::sort_index(values);
440 
441  for (size_t i = 0; i < bestIndex; ++i)
442  InsertNodeIntoTree(treeOne, oldChildren[indices[i]]);
443  for (size_t i = bestIndex; i < oldChildren.size(); ++i)
444  InsertNodeIntoTree(treeTwo, oldChildren[indices[i]]);
445 
446  // Insert the new tree node(s).
447  if (par)
448  {
449  // Insert the new node into the parent. The number of descendants does not
450  // need to be updated.
451  par->children[par->NumChildren()++] = treeTwo;
452  }
453  else
454  {
455  // Insert both nodes into 'tree', which is now a higher-level root node.
456  InsertNodeIntoTree(tree, treeOne);
457  InsertNodeIntoTree(tree, treeTwo);
458 
459  // We have to update the children of treeOne so that they record the correct
460  // parent.
461  for (size_t i = 0; i < treeOne->NumChildren(); ++i)
462  treeOne->children[i]->Parent() = treeOne;
463  }
464 
465  // Update the children of treeTwo to have the correct parent.
466  for (size_t i = 0; i < treeTwo->NumChildren(); ++i)
467  treeTwo->children[i]->Parent() = treeTwo;
468 
469  // If we have overflowed hte parent's children, then we need to split that
470  // node also.
471  if (par && par->NumChildren() >= par->MaxNumChildren() + 1)
472  RStarTreeSplit::SplitNonLeafNode(par, relevels);
473 
474  return false;
475 }
476 
481 template<typename TreeType>
482 void RStarTreeSplit::InsertNodeIntoTree(TreeType* destTree, TreeType* srcNode)
483 {
484  destTree->Bound() |= srcNode->Bound();
485  destTree->numDescendants += srcNode->numDescendants;
486  destTree->children[destTree->NumChildren()++] = srcNode;
487 }
488 
489 } // namespace tree
490 } // namespace mlpack
491 
492 #endif
static bool SplitNonLeafNode(TreeType *tree, std::vector< bool > &relevels)
Split a non-leaf node using the "default" algorithm.
Definition: r_star_tree_split_impl.hpp:271
static void SplitLeafNode(TreeType *tree, std::vector< bool > &relevels)
Split a leaf node using the algorithm described in "The R*-tree: An Efficient and Robust Access metho...
Definition: r_star_tree_split_impl.hpp:177
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
static void PickLeafSplit(TreeType *tree, size_t &bestAxis, size_t &bestIndex)
Given a node, return the best dimension and the best index to split on.
Definition: r_star_tree_split_impl.hpp:82
static size_t ReinsertPoints(TreeType *tree, std::vector< bool > &relevels)
Reinsert any points into the tree, if needed.
Definition: r_star_tree_split_impl.hpp:28
Definition of the Range class, which represents a simple range with a lower and upper bound...