mlpack
load_arff_impl.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP
13 #define MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP
14 
15 // In case it hasn't been included yet.
16 #include "load_arff.hpp"
17 
18 #include <boost/algorithm/string/trim.hpp>
19 #include "is_naninf.hpp"
20 
21 namespace mlpack {
22 namespace data {
23 
24 template<typename eT, typename PolicyType>
25 void LoadARFF(const std::string& filename,
26  arma::Mat<eT>& matrix,
28 {
29  // First, open the file.
30  std::ifstream ifs;
31  ifs.open(filename, std::ios::in | std::ios::binary);
32 
33  // if file is not open throw an error (file not found).
34  if (!ifs.is_open())
35  {
36  Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
37  }
38 
39  std::string line;
40  size_t dimensionality = 0;
41  // We'll store a vector of strings representing categories to be mapped, if
42  // needed.
43  std::map<size_t, std::vector<std::string>> categoryStrings;
44  std::vector<bool> types;
45  size_t headerLines = 0;
46  while (ifs.good())
47  {
48  // Read the next line, then strip whitespace from either side.
49  std::getline(ifs, line, '\n');
50  boost::trim(line);
51  ++headerLines;
52 
53  // Is the first character a comment, or is the line empty?
54  if (line[0] == '%' || line.empty())
55  continue; // Ignore this line.
56 
57  // If the first character is @, we are looking at @relation, @attribute, or
58  // @data.
59  if (line[0] == '@')
60  {
61  typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
62  std::string separators = " \t%"; // Split on comments too.
63  boost::escaped_list_separator<char> sep("\\", separators, "\"'");
64  Tokenizer tok(line, sep);
65  Tokenizer::iterator it = tok.begin();
66 
67  // Get the annotation we are looking at.
68  std::string annotation(*it);
69  std::transform(annotation.begin(), annotation.end(), annotation.begin(),
70  ::tolower);
71 
72  if (annotation == "@relation")
73  {
74  // We don't actually have anything to do with the name of the dataset.
75  continue;
76  }
77  else if (annotation == "@attribute")
78  {
79  ++dimensionality;
80  // We need to mark this dimension with its according type.
81  ++it; // Ignore the dimension name.
82  ++it;
83  // Collect all of the remaining tokens, which represent the dimension.
84  std::string dimType = "";
85  while (it != tok.end())
86  dimType += *(it++);
87  std::string origDimType(dimType); // We may need the original cases.
88  std::transform(dimType.begin(), dimType.end(), dimType.begin(),
89  ::tolower);
90 
91  if (dimType == "numeric" || dimType == "integer" || dimType == "real")
92  {
93  types.push_back(false); // The feature is numeric.
94  }
95  else if (dimType == "string")
96  {
97  types.push_back(true); // The feature is categorical.
98  }
99  else if (dimType[0] == '{')
100  {
101  // The feature is categorical, and we have all the types right here.
102  // Note that categories are case-sensitive, and so we must use the
103  // `origDimType` string here instead (which has not had ::tolower used
104  // on it).
105  types.push_back(true);
106  boost::trim_if(origDimType,
107  [](char c)
108  {
109  return c == '{' || c == '}' || c == ' ' || c == '\t';
110  });
111 
112  boost::escaped_list_separator<char> sep("\\", ",", "\"'");
113  Tokenizer dimTok(origDimType, sep);
114  Tokenizer::iterator it = dimTok.begin();
115  std::vector<std::string> categories;
116 
117  while (it != dimTok.end())
118  {
119  std::string category = (*it);
120  boost::trim(category);
121  categories.push_back(category);
122 
123  ++it;
124  }
125 
126  categoryStrings[dimensionality - 1] = std::move(categories);
127  }
128  }
129  else if (annotation == "@data")
130  {
131  // We are in the data section. So we can move out of this loop.
132  break;
133  }
134  else
135  {
136  throw std::runtime_error("unknown ARFF annotation '" + (*tok.begin()) +
137  "'");
138  }
139  }
140  }
141 
142  if (ifs.eof())
143  throw std::runtime_error("no @data section found");
144 
145  // Reset the DatasetInfo object, if needed.
146  if (info.Dimensionality() == 0)
147  {
148  info = DatasetMapper<PolicyType>(dimensionality);
149  }
150  else if (info.Dimensionality() != dimensionality)
151  {
152  std::ostringstream oss;
153  oss << "data::LoadARFF(): given DatasetInfo has dimensionality "
154  << info.Dimensionality() << ", but data has dimensionality "
155  << dimensionality;
156  throw std::invalid_argument(oss.str());
157  }
158 
159  for (size_t i = 0; i < types.size(); ++i)
160  {
161  if (types[i])
162  info.Type(i) = Datatype::categorical;
163  else
164  info.Type(i) = Datatype::numeric;
165  }
166 
167  // Make sure all strings are mapped, if we have any.
168  typedef std::map<size_t, std::vector<std::string>>::const_iterator
169  IteratorType;
170  for (IteratorType it = categoryStrings.begin(); it != categoryStrings.end();
171  ++it)
172  {
173  for (const std::string& str : (*it).second)
174  {
175  info.template MapString<eT>(str, (*it).first);
176  }
177  }
178 
179  // We need to find out how many lines of data are in the file.
180  std::streampos pos = ifs.tellg();
181  size_t row = 0;
182  while (ifs.good())
183  {
184  std::getline(ifs, line, '\n');
185  ++row;
186  }
187  // Uncount the EOF row.
188  --row;
189 
190  // Since we've hit the EOF, we have to call clear() so we can seek again.
191  ifs.clear();
192  ifs.seekg(pos);
193 
194  // Now, set the size of the matrix.
195  matrix.set_size(dimensionality, row);
196 
197  // Now we are looking at the @data section.
198  row = 0;
199  while (ifs.good())
200  {
201  std::getline(ifs, line, '\n');
202  boost::trim(line);
203  // Each line of the @data section must be a CSV (except sparse data, which
204  // we will handle later). So now we can tokenize the
205  // CSV and parse it. The '?' representing a missing value is not allowed,
206  // so if that occurs we throw an exception. We also throw an exception if
207  // any piece of data does not match its type (categorical or numeric).
208 
209  // If the first character is {, it is sparse data, and we can just say this
210  // is not handled for now...
211  if (line[0] == '{')
212  throw std::runtime_error("cannot yet parse sparse ARFF data");
213 
214  // Tokenize the line.
215  typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
216  boost::escaped_list_separator<char> sep("\\", ",", "\"");
217  Tokenizer tok(line, sep);
218 
219  size_t col = 0;
220  std::stringstream token;
221  for (Tokenizer::iterator it = tok.begin(); it != tok.end(); ++it)
222  {
223  // Check that we are not too many columns in.
224  if (col >= matrix.n_rows)
225  {
226  std::stringstream error;
227  error << "Too many columns in line " << (headerLines + row) << ".";
228  throw std::runtime_error(error.str());
229  }
230 
231  // What should this token be?
232  if (info.Type(col) == Datatype::categorical)
233  {
234  // Strip spaces before mapping.
235  std::string token = *it;
236  boost::trim(token);
237  const size_t currentNumMappings = info.NumMappings(col);
238  const eT result = info.template MapString<eT>(token, col);
239 
240  // If the set of categories was pre-specified, then we must crash if
241  // this was not one of those categories.
242  if (categoryStrings.count(col) > 0 &&
243  currentNumMappings < info.NumMappings(col))
244  {
245  std::stringstream error;
246  error << "Parse error at line " << (headerLines + row) << " token "
247  << col << ": category \"" << token << "\" not in the set of known"
248  << " categories for this dimension (";
249  for (size_t i = 0; i < categoryStrings.at(col).size() - 1; ++i)
250  error << "\"" << categoryStrings.at(col)[i] << "\", ";
251  error << "\"" << categoryStrings.at(col).back() << "\").";
252  throw std::runtime_error(error.str());
253  }
254 
255  // We load transposed.
256  matrix(col, row) = result;
257  }
258  else if (info.Type(col) == Datatype::numeric)
259  {
260  // Attempt to read as numeric.
261  token.clear();
262  token.str(*it);
263 
264  eT val = eT(0);
265  token >> val;
266 
267  if (token.fail())
268  {
269  // Check for NaN or inf.
270  if (!IsNaNInf(val, token.str()))
271  {
272  // Okay, it's not NaN or inf. If it's '?', we issue a specific
273  // error, otherwise we issue a general error.
274  std::stringstream error;
275  std::string tokenStr = token.str();
276  boost::trim(tokenStr);
277  if (tokenStr == "?")
278  error << "Missing values ('?') not supported, ";
279  else
280  error << "Parse error ";
281  error << "at line " << (headerLines + row) << " token " << col
282  << ": \"" << tokenStr << "\".";
283  throw std::runtime_error(error.str());
284  }
285  }
286 
287  // If we made it to here, we have a value.
288  matrix(col, row) = val; // We load transposed.
289  }
290 
291  ++col;
292  }
293  ++row;
294  }
295 }
296 
297 } // namespace data
298 } // namespace mlpack
299 
300 #endif
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the datatype of each dimension.
Definition: dataset_mapper.hpp:41
void LoadARFF(const std::string &filename, arma::Mat< eT > &matrix)
A utility function to load an ARFF dataset as numeric features (that is, as an Armadillo matrix without any categorical features).
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
Definition: log.hpp:90
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
bool IsNaNInf(T &val, const std::string &token)
See if the token is a NaN or an Inf, and if so, set the value accordingly and return a boolean representing whether or not it is.
Definition: is_naninf.hpp:27
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information for).
Definition: dataset_mapper_impl.hpp:228
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
Definition: dataset_mapper_impl.hpp:196
size_t NumMappings(const size_t dimension) const
Get the number of mappings for a particular dimension.
Definition: dataset_mapper_impl.hpp:222