mlpack
load_impl.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_CORE_DATA_LOAD_IMPL_HPP
13 #define MLPACK_CORE_DATA_LOAD_IMPL_HPP
14 
15 // In case it hasn't already been included.
16 
17 #include <exception>
18 #include <algorithm>
20 
21 #include "load_csv.hpp"
22 #include "load.hpp"
23 #include "extension.hpp"
24 #include "detect_file_type.hpp"
25 
26 #include <boost/algorithm/string/trim.hpp>
27 #include <boost/tokenizer.hpp>
28 #include <boost/algorithm/string.hpp>
29 
30 #include "load_arff.hpp"
31 
32 namespace mlpack {
33 namespace data {
34 
35 namespace details{
36 
37 template<typename Tokenizer>
38 std::vector<std::string> ToTokens(Tokenizer& lineTok)
39 {
40  std::vector<std::string> tokens;
41  std::transform(std::begin(lineTok), std::end(lineTok),
42  std::back_inserter(tokens),
43  [&tokens](std::string const &str)
44  {
45  std::string trimmedToken(str);
46  boost::trim(trimmedToken);
47  return std::move(trimmedToken);
48  });
49 
50  return tokens;
51 }
52 
/**
 * Extract a single column from a table of tokens.
 *
 * `output` is cleared and then filled with `input[i][index]` for every row
 * `i`, in row order, so afterwards `output.size() == input.size()`.
 *
 * @param input Rows of tokens.
 * @param output Receives the token found at position `index` of each row; any
 *     previous contents are discarded.
 * @param index Position of the token to take from each row.
 * @throws std::out_of_range if some row holds `index` or fewer tokens.  In
 *     that case `output` contains the tokens of the rows preceding the
 *     offending one.
 */
inline
void TransposeTokens(std::vector<std::vector<std::string>> const &input,
                     std::vector<std::string>& output,
                     size_t index)
{
  output.clear();
  // Exactly one token is produced per row; allocate once up front.
  output.reserve(input.size());
  for (std::vector<std::string> const &row : input)
  {
    // Bounds-checked access: a row shorter than the others must fail loudly
    // instead of reading past the end of the row.
    output.push_back(row.at(index));
  }
}
64 
65 } // namespace details
66 
67 template <typename MatType>
68 bool inline inplace_transpose(MatType& X, bool fatal)
69 {
70  try
71  {
72  X = arma::trans(X);
73  return true;
74  }
75  catch (const std::exception& e)
76  {
77  if (fatal)
78  Log::Fatal << "\nTranspose Operation Failed.\n"
79  "Exception: " << e.what() << std::endl;
80  else
81  Log::Warn << "\nTranspose Operation Failed.\n"
82  "Exception: " << e.what() << std::endl;
83 
84  return false;
85  }
86 }
87 
88 template<typename eT>
89 bool Load(const std::string& filename,
90  arma::Mat<eT>& matrix,
91  const bool fatal,
92  const bool transpose,
93  const arma::file_type inputLoadType)
94 {
95  Timer::Start("loading_data");
96 
97  // Catch nonexistent files by opening the stream ourselves.
98  std::fstream stream;
99 #ifdef _WIN32 // Always open in binary mode on Windows.
100  stream.open(filename.c_str(), std::fstream::in | std::fstream::binary);
101 #else
102  stream.open(filename.c_str(), std::fstream::in);
103 #endif
104  if (!stream.is_open())
105  {
106  Timer::Stop("loading_data");
107  if (fatal)
108  Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
109  else
110  Log::Warn << "Cannot open file '" << filename << "'; load failed."
111  << std::endl;
112 
113  return false;
114  }
115 
116  arma::file_type loadType = inputLoadType;
117  std::string stringType;
118  if (inputLoadType == arma::auto_detect)
119  {
120  // Attempt to auto-detect the type from the given file.
121  loadType = AutoDetect(stream, filename);
122  // Provide error if we don't know the type.
123  if (loadType == arma::file_type_unknown)
124  {
125  Timer::Stop("loading_data");
126  if (fatal)
127  Log::Fatal << "Unable to detect type of '" << filename << "'; "
128  << "incorrect extension?" << std::endl;
129  else
130  Log::Warn << "Unable to detect type of '" << filename << "'; load "
131  << " failed. Incorrect extension?" << std::endl;
132 
133  return false;
134  }
135  }
136 
137  stringType = GetStringType(loadType);
138 
139 #ifndef ARMA_USE_HDF5
140  if (inputLoadType == arma::hdf5_binary)
141  {
142  // Ensure that HDF5 is supported.
143  Timer::Stop("loading_data");
144  if (fatal)
145  Log::Fatal << "Attempted to load '" << filename << "' as HDF5 data, but "
146  << "Armadillo was compiled without HDF5 support. Load failed."
147  << std::endl;
148  else
149  Log::Warn << "Attempted to load '" << filename << "' as HDF5 data, but "
150  << "Armadillo was compiled without HDF5 support. Load failed."
151  << std::endl;
152 
153  return false;
154  }
155 #endif
156 
157  // Try to load the file; but if it's raw_binary, it could be a problem.
158  if (loadType == arma::raw_binary)
159  Log::Warn << "Loading '" << filename << "' as " << stringType << "; "
160  << "but this may not be the actual filetype!" << std::endl;
161  else
162  Log::Info << "Loading '" << filename << "' as " << stringType << ". "
163  << std::flush;
164 
165  // We can't use the stream if the type is HDF5.
166  bool success;
167  if (loadType != arma::hdf5_binary)
168  success = matrix.load(stream, loadType);
169  else
170  success = matrix.load(filename, loadType);
171 
172  if (!success)
173  {
174  Log::Info << std::endl;
175  Timer::Stop("loading_data");
176  if (fatal)
177  Log::Fatal << "Loading from '" << filename << "' failed." << std::endl;
178  else
179  Log::Warn << "Loading from '" << filename << "' failed." << std::endl;
180 
181  return false;
182  }
183  else
184  Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows)
185  << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n";
186 
187  // Now transpose the matrix, if necessary.
188  if (transpose)
189  {
190  success = inplace_transpose(matrix, fatal);
191  }
192 
193  Timer::Stop("loading_data");
194 
195  // Finally, return the success indicator.
196  return success;
197 }
198 
199 // Load with mappings. Unfortunately we have to implement this ourselves.
200 template<typename eT, typename PolicyType>
201 bool Load(const std::string& filename,
202  arma::Mat<eT>& matrix,
204  const bool fatal,
205  const bool transpose)
206 {
207  // Get the extension and load as necessary.
208  Timer::Start("loading_data");
209 
210  // Get the extension.
211  std::string extension = Extension(filename);
212 
213  // Catch nonexistent files by opening the stream ourselves.
214  std::fstream stream;
215  stream.open(filename.c_str(), std::fstream::in);
216 
217  if (!stream.is_open())
218  {
219  Timer::Stop("loading_data");
220  if (fatal)
221  Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
222  else
223  Log::Warn << "Cannot open file '" << filename << "'; load failed."
224  << std::endl;
225 
226  return false;
227  }
228 
229  if (extension == "csv" || extension == "tsv" || extension == "txt")
230  {
231  Log::Info << "Loading '" << filename << "' as CSV dataset. " << std::flush;
232  try
233  {
234  LoadCSV loader(filename);
235  loader.Load(matrix, info, transpose);
236  }
237  catch (std::exception& e)
238  {
239  Timer::Stop("loading_data");
240  if (fatal)
241  Log::Fatal << e.what() << std::endl;
242  else
243  Log::Warn << e.what() << std::endl;
244 
245  return false;
246  }
247  }
248  else if (extension == "arff")
249  {
250  Log::Info << "Loading '" << filename << "' as ARFF dataset. "
251  << std::flush;
252  try
253  {
254  LoadARFF(filename, matrix, info);
255 
256  // We transpose by default. So, un-transpose if necessary...
257  if (!transpose)
258  {
259  return inplace_transpose(matrix, fatal);
260  }
261  }
262  catch (std::exception& e)
263  {
264  Timer::Stop("loading_data");
265  if (fatal)
266  Log::Fatal << e.what() << std::endl;
267  else
268  Log::Warn << e.what() << std::endl;
269 
270  return false;
271  }
272  }
273  else
274  {
275  // The type is unknown.
276  Timer::Stop("loading_data");
277  if (fatal)
278  Log::Fatal << "Unable to detect type of '" << filename << "'; "
279  << "incorrect extension?" << std::endl;
280  else
281  Log::Warn << "Unable to detect type of '" << filename << "'; load failed."
282  << " Incorrect extension?" << std::endl;
283 
284  return false;
285  }
286 
287  Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows)
288  << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n";
289 
290  Timer::Stop("loading_data");
291 
292  return true;
293 }
294 
295 // For loading data into sparse matrix
296 template <typename eT>
297 bool Load(const std::string& filename,
298  arma::SpMat<eT>& matrix,
299  const bool fatal,
300  const bool transpose)
301 {
302  Timer::Start("loading_data");
303 
304  // Get the extension.
305  std::string extension = Extension(filename);
306 
307  // Catch nonexistent files by opening the stream ourselves.
308  std::fstream stream;
309 #ifdef _WIN32 // Always open in binary mode on Windows.
310  stream.open(filename.c_str(), std::fstream::in | std::fstream::binary);
311 #else
312  stream.open(filename.c_str(), std::fstream::in);
313 #endif
314  if (!stream.is_open())
315  {
316  Timer::Stop("loading_data");
317  if (fatal)
318  Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
319  else
320  Log::Warn << "Cannot open file '" << filename << "'; load failed."
321  << std::endl;
322 
323  return false;
324  }
325 
326  bool unknownType = false;
327  arma::file_type loadType;
328  std::string stringType;
329 
330  if (extension == "tsv" || extension == "txt")
331  {
332  loadType = arma::coord_ascii;
333  stringType = "Coordinate Formatted Data for Sparse Matrix";
334  }
335  else if (extension == "bin")
336  {
337  // This could be raw binary or Armadillo binary (binary with header). We
338  // will check to see if it is Armadillo binary.
339  const std::string ARMA_SPM_BIN = "ARMA_SPM_BIN";
340  std::string rawHeader(ARMA_SPM_BIN.length(), '\0');
341 
342  std::streampos pos = stream.tellg();
343 
344  stream.read(&rawHeader[0], std::streamsize(ARMA_SPM_BIN.length()));
345  stream.clear();
346  stream.seekg(pos); // Reset stream position after peeking.
347 
348  if (rawHeader == ARMA_SPM_BIN)
349  {
350  stringType = "Armadillo binary formatted data for sparse matrix";
351  loadType = arma::arma_binary;
352  }
353  else // We can only assume it's raw binary.
354  {
355  stringType = "raw binary formatted data";
356  loadType = arma::raw_binary;
357  }
358  }
359  else // Unknown extension...
360  {
361  unknownType = true;
362  loadType = arma::raw_binary; // Won't be used; prevent a warning.
363  stringType = "";
364  }
365 
366  // Provide error if we don't know the type.
367  if (unknownType)
368  {
369  Timer::Stop("loading_data");
370  if (fatal)
371  Log::Fatal << "Unable to detect type of '" << filename << "'; "
372  << "incorrect extension?" << std::endl;
373  else
374  Log::Warn << "Unable to detect type of '" << filename << "'; load failed."
375  << " Incorrect extension?" << std::endl;
376 
377  return false;
378  }
379 
380  // Try to load the file; but if it's raw_binary, it could be a problem.
381  if (loadType == arma::raw_binary)
382  Log::Warn << "Loading '" << filename << "' as " << stringType << "; "
383  << "but this may not be the actual filetype!" << std::endl;
384  else
385  Log::Info << "Loading '" << filename << "' as " << stringType << ". "
386  << std::flush;
387 
388  bool success;
389 
390  success = matrix.load(stream, loadType);
391 
392  if (!success)
393  {
394  Log::Info << std::endl;
395  Timer::Stop("loading_data");
396  if (fatal)
397  Log::Fatal << "Loading from '" << filename << "' failed." << std::endl;
398  else
399  Log::Warn << "Loading from '" << filename << "' failed." << std::endl;
400 
401  return false;
402  }
403  else
404  Log::Info << "Size is " << (transpose ? matrix.n_cols : matrix.n_rows)
405  << " x " << (transpose ? matrix.n_rows : matrix.n_cols) << ".\n";
406 
407  // Now transpose the matrix, if necessary.
408  if (transpose)
409  {
410  success = inplace_transpose(matrix, fatal);
411  }
412 
413  Timer::Stop("loading_data");
414 
415  // Finally, return the success indicator.
416  return success;
417 }
418 
419 } // namespace data
420 } // namespace mlpack
421 
422 #endif
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the data...
Definition: dataset_mapper.hpp:41
Load the csv file. This class uses boost::spirit to implement the parser; please refer to the following lin...
Definition: load_csv.hpp:36
static void Start(const std::string &name)
Start the given timer.
Definition: timers.cpp:28
void LoadARFF(const std::string &filename, arma::Mat< eT > &matrix)
A utility function to load an ARFF dataset as numeric features (that is, as an Armadillo matrix witho...
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
Definition: log.hpp:90
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void Load(arma::Mat< T > &inout, DatasetMapper< PolicyType > &infoSet, const bool transpose=true)
Load the file into the given matrix with the given DatasetMapper object.
Definition: load_csv.hpp:55
Definition: dtree_impl.hpp:21
std::string GetStringType(const arma::file_type &type)
Given a file type, return a logical name corresponding to that file type.
Definition: detect_file_type.cpp:30
static MLPACK_EXPORT util::PrefixedOutStream Warn
Prints warning messages prefixed with [WARN ].
Definition: log.hpp:87
arma::file_type AutoDetect(std::fstream &stream, const std::string &filename)
Attempt to auto-detect the type of a file given its extension, and by inspecting the parts of the fil...
Definition: detect_file_type.cpp:192
static void Stop(const std::string &name)
Stop the given timer.
Definition: timers.cpp:36
bool Load(const std::string &filename, arma::Mat< eT > &matrix, const bool fatal=false, const bool transpose=true, const arma::file_type inputLoadType=arma::auto_detect)
Loads a matrix from file, guessing the filetype from the extension.
Definition: load_impl.hpp:89
static MLPACK_EXPORT util::PrefixedOutStream Info
Prints informational messages if --verbose is specified, prefixed with [INFO ].
Definition: log.hpp:84