mlpack
cf_impl.hpp
Go to the documentation of this file.
1 
16 #ifndef MLPACK_METHODS_CF_CF_IMPL_HPP
17 #define MLPACK_METHODS_CF_CF_IMPL_HPP
18 
19 // In case it hasn't been included yet.
20 #include "cf.hpp"
21 
22 namespace mlpack {
23 namespace cf {
24 
25 // Default CF constructor.
26 template<typename DecompositionPolicy,
27  typename NormalizationType>
28 CFType<DecompositionPolicy,
29  NormalizationType>::
30 CFType(const size_t numUsersForSimilarity,
31  const size_t rank) :
32  numUsersForSimilarity(numUsersForSimilarity),
33  rank(rank)
34 {
35  // Validate neighbourhood size.
36  if (numUsersForSimilarity < 1)
37  {
38  Log::Warn << "CFType::CFType(): neighbourhood size should be > 0 ("
39  << numUsersForSimilarity << " given). Setting value to 5.\n";
40  // Set default value of 5.
41  this->numUsersForSimilarity = 5;
42  }
43 }
44 
48 template<typename DecompositionPolicy,
49  typename NormalizationType>
50 template<typename MatType>
51 CFType<DecompositionPolicy,
52  NormalizationType>::
53 CFType(const MatType& data,
54  const DecompositionPolicy& decomposition,
55  const size_t numUsersForSimilarity,
56  const size_t rank,
57  const size_t maxIterations,
58  const double minResidue,
59  const bool mit) :
60  numUsersForSimilarity(numUsersForSimilarity),
61  rank(rank)
62 {
63  // Validate neighbourhood size.
64  if (numUsersForSimilarity < 1)
65  {
66  Log::Warn << "CFType::CFType(): neighbourhood size should be > 0 ("
67  << numUsersForSimilarity << " given). Setting value to 5.\n";
68  // Set default value of 5.
69  this->numUsersForSimilarity = 5;
70  }
71 
72  Train(data, decomposition, maxIterations, minResidue, mit);
73 }
74 
75 // Train when data is given in dense matrix form.
76 template<typename DecompositionPolicy,
77  typename NormalizationType>
78 void CFType<DecompositionPolicy,
79  NormalizationType>::
80 Train(const arma::mat& data,
81  const DecompositionPolicy& decomposition,
82  const size_t maxIterations,
83  const double minResidue,
84  const bool mit)
85 {
86  this->decomposition = decomposition;
87 
88  // Make a copy of data before performing normalization.
89  arma::mat normalizedData(data);
90  normalization.Normalize(normalizedData);
91  CleanData(normalizedData, cleanedData);
92 
93  // Check if the user wanted us to choose a rank for them.
94  if (rank == 0)
95  {
96  // This is a simple heuristic that picks a rank based on the density of the
97  // dataset between 5 and 105.
98  const double density = (cleanedData.n_nonzero * 100.0) / cleanedData.n_elem;
99  const size_t rankEstimate = size_t(density) + 5;
100 
101  // Set to heuristic value.
102  Log::Info << "No rank given for decomposition; using rank of "
103  << rankEstimate << " calculated by density-based heuristic."
104  << std::endl;
105  this->rank = rankEstimate;
106  }
107 
108  // Decompose the data matrix (which is in coordinate list form) to user and
109  // data matrices.
110  Timer::Start("cf_factorization");
111  this->decomposition.Apply(
112  normalizedData, cleanedData, rank, maxIterations, minResidue, mit);
113  Timer::Stop("cf_factorization");
114 }
115 
116 // Train when data is given as sparse matrix of user item table.
117 template<typename DecompositionPolicy,
118  typename NormalizationType>
119 void CFType<DecompositionPolicy,
120  NormalizationType>::
121 Train(const arma::sp_mat& data,
122  const DecompositionPolicy& decomposition,
123  const size_t maxIterations,
124  const double minResidue,
125  const bool mit)
126 {
127  this->decomposition = decomposition;
128 
129  // data is not used in the following decomposition.Apply() method, so we only
130  // need to Normalize cleanedData.
131  cleanedData = data;
132  normalization.Normalize(cleanedData);
133 
134  // Check if the user wanted us to choose a rank for them.
135  if (rank == 0)
136  {
137  // This is a simple heuristic that picks a rank based on the density of the
138  // dataset between 5 and 105.
139  const double density = (cleanedData.n_nonzero * 100.0) / cleanedData.n_elem;
140  const size_t rankEstimate = size_t(density) + 5;
141 
142  // Set to heuristic value.
143  Log::Info << "No rank given for decomposition; using rank of "
144  << rankEstimate << " calculated by density-based heuristic."
145  << std::endl;
146  this->rank = rankEstimate;
147  }
148 
149  // Decompose the data matrix (which is in coordinate list form) to user and
150  // data matrices.
151  Timer::Start("cf_factorization");
152  this->decomposition.Apply(
153  data, cleanedData, rank, maxIterations, minResidue, mit);
154  Timer::Stop("cf_factorization");
155 }
156 
157 template<typename DecompositionPolicy,
158  typename NormalizationType>
159 template<typename NeighborSearchPolicy,
160  typename InterpolationPolicy>
161 void CFType<DecompositionPolicy,
162  NormalizationType>::
163 GetRecommendations(const size_t numRecs,
164  arma::Mat<size_t>& recommendations)
165 {
166  // Generate list of users. Maybe it would be more efficient to pass an empty
167  // users list, and then have the other overload of GetRecommendations() assume
168  // that if users is empty, then recommendations should be generated for all
169  // users?
170  arma::Col<size_t> users = arma::linspace<arma::Col<size_t> >(0,
171  cleanedData.n_cols - 1, cleanedData.n_cols);
172 
173  // Call the main overload for recommendations.
174  GetRecommendations<NeighborSearchPolicy,
175  InterpolationPolicy>(numRecs, recommendations, users);
176 }
177 
// Generate the top-numRecs un-rated items for each queried user, using
// neighborhood-based collaborative filtering: find each user's nearest
// neighbors, interpolate their ratings, and keep the numRecs best candidates.
template<typename DecompositionPolicy,
         typename NormalizationType>
template<typename NeighborSearchPolicy,
         typename InterpolationPolicy>
void CFType<DecompositionPolicy,
            NormalizationType>::
GetRecommendations(const size_t numRecs,
                   arma::Mat<size_t>& recommendations,
                   const arma::Col<size_t>& users)
{
  // Temporary storage for neighborhood of the queried users.
  arma::Mat<size_t> neighborhood;
  // Resulting similarities.
  arma::mat similarities;

  // Calculate the neighborhood of the queried users. Note that the query user
  // is part of the neighborhood---this is intentional. We want to use the
  // weighted sum of both the query user and the local neighborhood of the
  // query user.
  decomposition.template GetNeighborhood<NeighborSearchPolicy>(
      users, numUsersForSimilarity, neighborhood, similarities);

  // Generate recommendations for each query user by finding the maximum
  // numRecs elements in the ratings vector.  Pre-fill with sentinels:
  // SIZE_MAX marks "no recommendation found" slots.
  recommendations.set_size(numRecs, users.n_elem);
  arma::mat values(numRecs, users.n_elem);
  recommendations.fill(SIZE_MAX);
  values.fill(DBL_MAX);

  // Initialization of an InterpolationPolicy object is hoisted out of the
  // following loop, because the initialization may take a relatively long
  // time and we don't want to repeat it on every iteration.
  InterpolationPolicy interpolation(cleanedData);

  for (size_t i = 0; i < users.n_elem; ++i)
  {
    // First, calculate the weighted sum of neighborhood values.  One entry
    // per item (items are the rows of cleanedData).
    arma::vec ratings;
    ratings.zeros(cleanedData.n_rows);

    // Calculate interpolation weights for this user's neighbors.
    arma::vec weights(numUsersForSimilarity);
    interpolation.GetWeights(weights, decomposition, users(i),
        neighborhood.col(i), similarities.col(i), cleanedData);

    // Accumulate each neighbor's full rating vector, scaled by its weight.
    for (size_t j = 0; j < neighborhood.n_rows; ++j)
    {
      arma::vec neighborRatings;
      decomposition.GetRatingOfUser(neighborhood(j, i), neighborRatings);
      ratings += weights(j) * neighborRatings;
    }

    // Let's build the list of candidate recommendations for the given user.
    // Default candidate: the smallest possible value and invalid item number
    // (cleanedData.n_rows is one past the last valid item index).  The
    // min-heap (via CandidateCmp) keeps the numRecs best candidates, with the
    // worst of them at the top for cheap replacement.
    const Candidate def = std::make_pair(-DBL_MAX, cleanedData.n_rows);
    std::vector<Candidate> vect(numRecs, def);
    typedef std::priority_queue<Candidate, std::vector<Candidate>, CandidateCmp>
        CandidateList;
    CandidateList pqueue(CandidateCmp(), std::move(vect));

    // Look through the ratings column corresponding to the current user.
    for (size_t j = 0; j < ratings.n_rows; ++j)
    {
      // Ensure that the user hasn't already rated the item.
      // The algorithm omits ratings of zero. Thus, when normalizing original
      // ratings in Normalize(), if a normalized rating equals zero, it is set
      // to the smallest positive double value, so zero here reliably means
      // "unrated".
      if (cleanedData(j, users(i)) != 0.0)
        continue; // The user already rated the item.

      // Is the estimated value better than the worst candidate?
      // Denormalize rating before comparison.
      double realRating = normalization.Denormalize(users(i), j, ratings[j]);
      if (realRating > pqueue.top().first)
      {
        // Replace the current worst candidate with this item.
        Candidate c = std::make_pair(realRating, j);
        pqueue.pop();
        pqueue.push(c);
      }
    }

    // Pop candidates worst-first; fill output columns from the bottom up so
    // the best recommendation ends up in row 0.
    for (size_t p = 1; p <= numRecs; p++)
    {
      recommendations(numRecs - p, i) = pqueue.top().second;
      values(numRecs - p, i) = pqueue.top().first;
      pqueue.pop();
    }

    // If we were not able to come up with enough recommendations (a default
    // sentinel candidate survived), issue a warning.
    if (recommendations(numRecs - 1, i) == def.second)
      Log::Warn << "Could not provide " << numRecs << " recommendations "
          << "for user " << users(i) << " (not enough un-rated items)!"
          << std::endl;
  }
}
275 
276 // Predict the rating for a single user/item combination.
277 template<typename DecompositionPolicy,
278  typename NormalizationType>
279 template<typename NeighborSearchPolicy,
280  typename InterpolationPolicy>
281 double CFType<DecompositionPolicy,
282  NormalizationType>::
283 Predict(const size_t user, const size_t item) const
284 {
285  // First, we need to find the nearest neighbors of the given user.
286  // We'll use the same technique as for GetRecommendations().
287 
288  // Temporary storage for neighborhood of the queried users.
289  arma::Mat<size_t> neighborhood;
290  // Resulting similarities.
291  arma::mat similarities;
292 
293  // Calculate the neighborhood of the queried users. Note that the query user
294  // is part of the neighborhood---this is intentional. We want to use the
295  // weighted sum of both the query user and the local neighborhood of the
296  // query user.
297  // Calculate the neighborhood of the queried users.
298  arma::Col<size_t> users(1);
299  users(0) = user;
300  decomposition.template GetNeighborhood<NeighborSearchPolicy>(
301  users, numUsersForSimilarity, neighborhood, similarities);
302 
303  arma::vec weights(numUsersForSimilarity);
304 
305  // Calculate interpolation weights.
306  InterpolationPolicy interpolation(cleanedData);
307  interpolation.GetWeights(weights, decomposition, user,
308  neighborhood.col(0), similarities.col(0), cleanedData);
309 
310  double rating = 0; // We'll take the weighted sum of neighborhood values.
311 
312  for (size_t j = 0; j < neighborhood.n_rows; ++j)
313  rating += weights(j) * decomposition.GetRating(neighborhood(j, 0), item);
314 
315  // Denormalize rating and return.
316  double realRating = normalization.Denormalize(user, item, rating);
317  return realRating;
318 }
319 
// Predict the rating for a group of user/item combinations.  Each column of
// `combinations` is a pair: row 0 is the user index, row 1 is the item index.
// Predictions are written to `predictions` in the same column order as
// `combinations`.
template<typename DecompositionPolicy,
         typename NormalizationType>
template<typename NeighborSearchPolicy,
         typename InterpolationPolicy>
void CFType<DecompositionPolicy,
            NormalizationType>::
Predict(const arma::Mat<size_t>& combinations,
        arma::vec& predictions) const
{
  // Now, we must determine those query indices we need to find the nearest
  // neighbors for. This is easiest if we just sort the combinations matrix
  // by user index (row 0); `ordering` maps sorted positions back to original
  // columns.
  arma::Mat<size_t> sortedCombinations(combinations.n_rows,
      combinations.n_cols);
  arma::uvec ordering = arma::sort_index(combinations.row(0).t());
  for (size_t i = 0; i < ordering.n_elem; ++i)
    sortedCombinations.col(i) = combinations.col(ordering[i]);

  // Now, we have to get the list of unique users we will be searching for.
  // arma::unique() returns them sorted, which matches the sorted-combination
  // traversal below.
  arma::Col<size_t> users = arma::unique(combinations.row(0).t());

  // Temporary storage for neighborhood of the queried users.
  arma::Mat<size_t> neighborhood;
  // Resulting similarities.
  arma::mat similarities;

  // Calculate the neighborhood of the queried users. Note that the query user
  // is part of the neighborhood---this is intentional. We want to use the
  // weighted sum of both the query user and the local neighborhood of the
  // query user.
  decomposition.template GetNeighborhood<NeighborSearchPolicy>(
      users, numUsersForSimilarity, neighborhood, similarities);

  // One column of interpolation weights per unique user.
  arma::mat weights(numUsersForSimilarity, users.n_elem);

  // Calculate interpolation weights (the InterpolationPolicy is constructed
  // once, outside the loop, since its initialization may be expensive).
  InterpolationPolicy interpolation(cleanedData);
  for (size_t i = 0; i < users.n_elem; ++i)
  {
    interpolation.GetWeights(weights.col(i), decomposition, users[i],
        neighborhood.col(i), similarities.col(i), cleanedData);
  }

  // Now that we have the neighborhoods we need, calculate the predictions.
  predictions.set_size(combinations.n_cols);

  size_t user = 0; // Cumulative user count, because we are doing it in order.
  for (size_t i = 0; i < sortedCombinations.n_cols; ++i)
  {
    // Could this be made faster by calculating dot products for multiple items
    // at once?
    double rating = 0.0;

    // Map the combination's user to the user ID used for kNN.  Both `users`
    // and `sortedCombinations` are sorted by user, so this index only ever
    // advances.
    while (users[user] < sortedCombinations(0, i))
      ++user;

    // Weighted sum of each neighbor's rating for this item.
    for (size_t j = 0; j < neighborhood.n_rows; ++j)
    {
      rating += weights(j, user) * decomposition.GetRating(
          neighborhood(j, user), sortedCombinations(1, i));
    }

    // Store in the original (pre-sort) column position.
    predictions(ordering[i]) = rating;
  }

  // Denormalize ratings, in place, for all (user, item) pairs at once.
  normalization.Denormalize(combinations, predictions);
}
390 
391 template<typename DecompositionPolicy,
392  typename NormalizationType>
393 void CFType<DecompositionPolicy,
394  NormalizationType>::
395 CleanData(const arma::mat& data, arma::sp_mat& cleanedData)
396 {
397  // Generate list of locations for batch insert constructor for sparse
398  // matrices.
399  arma::umat locations(2, data.n_cols);
400  arma::vec values(data.n_cols);
401  for (size_t i = 0; i < data.n_cols; ++i)
402  {
403  // We have to transpose it because items are rows, and users are columns.
404  locations(1, i) = ((arma::uword) data(0, i));
405  locations(0, i) = ((arma::uword) data(1, i));
406  values(i) = data(2, i);
407 
408  // The algorithm omits rating of zero. Thus, when normalizing original
409  // ratings in Normalize(), if normalized rating equals zero, it is set
410  // to the smallest positive double value.
411  if (values(i) == 0)
412  Log::Warn << "User rating of 0 ignored for user " << locations(1, i)
413  << ", item " << locations(0, i) << "." << std::endl;
414  }
415 
416  // Find maximum user and item IDs.
417  const size_t maxItemID = (size_t) max(locations.row(0)) + 1;
418  const size_t maxUserID = (size_t) max(locations.row(1)) + 1;
419 
420  // Fill sparse matrix.
421  cleanedData = arma::sp_mat(locations, values, maxItemID, maxUserID);
422 }
423 
// Serialize (or deserialize) the model with cereal.
// NOTE: the archive field order below is part of the on-disk format; do not
// reorder these calls without bumping the serialization version.
template<typename DecompositionPolicy,
         typename NormalizationType>
template<typename Archive>
void CFType<DecompositionPolicy,
            NormalizationType>::
serialize(Archive& ar, const uint32_t /* version */)
{
  // This model is simple; just serialize all the members. No special handling
  // required.
  ar(CEREAL_NVP(numUsersForSimilarity));
  ar(CEREAL_NVP(rank));
  ar(CEREAL_NVP(decomposition));
  ar(CEREAL_NVP(cleanedData));
  ar(CEREAL_NVP(normalization));
}
440 
441 } // namespace cf
442 } // namespace mlpack
443 
444 #endif
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
Definition: hmm_train_main.cpp:300
This class implements Collaborative Filtering (CF).
Definition: cf.hpp:70