// ARFF loader implementation (the error text further down names it
// data::LoadARFF()).  Visible flow: open the file, parse the '@' header
// annotations to learn each dimension's type, record those types and the
// declared category strings in `info`, then parse the comma-separated data
// lines into `matrix`.
// NOTE(review): the leading integers are line numbers from the original
// header; wherever they skip, the original lines are absent from this listing,
// so the comments below describe only statements that are actually visible.
12 #ifndef MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP 13 #define MLPACK_CORE_DATA_LOAD_ARFF_IMPL_HPP 18 #include <boost/algorithm/string/trim.hpp> 24 template<
typename eT,
typename PolicyType>
26 arma::Mat<eT>& matrix,
// Open the input file for reading, in binary mode.
31 ifs.open(filename, std::ios::in | std::ios::binary);
// Report an unopenable file through the fatal log stream.
// NOTE(review): the condition guarding this statement is not visible here.
36 Log::Fatal <<
"Cannot open file '" << filename <<
"'. " << std::endl;
// Used below as the source of the zero-based index for category lists
// (dimensionality - 1) and as the number of rows of the output matrix.
40 size_t dimensionality = 0;
// Category strings declared in the header, keyed by dimension index.
43 std::map<size_t, std::vector<std::string>> categoryStrings;
// One flag per parsed attribute: false is pushed for numeric/integer/real,
// true for string and {...} attributes.
44 std::vector<bool> types;
// Added to the data row index when line numbers are reported in error messages.
45 size_t headerLines = 0;
// Read the next line of the file, up to the newline character.
49 std::getline(ifs, line,
'\n');
// Test for a comment line (leading '%') or an empty line.
// NOTE(review): the statement controlled by this test is not visible here.
54 if (line[0] ==
'%' || line.empty())
// Tokenizer for an annotation line: backslash is the escape character, tokens
// are split on space, tab, or '%', and both double and single quotes act as
// quote characters.
61 typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
62 std::string separators =
" \t%";
63 boost::escaped_list_separator<char> sep(
"\\", separators,
"\"'");
64 Tokenizer tok(line, sep);
65 Tokenizer::iterator it = tok.begin();
// The first token is the annotation name.  It is transformed in place before
// being compared against the lowercase names below (the transform's function
// argument is on a line not shown; presumably a lowercase conversion).
68 std::string annotation(*it);
69 std::transform(annotation.begin(), annotation.end(), annotation.begin(),
72 if (annotation ==
"@relation")
77 else if (annotation ==
"@attribute")
// dimType holds the attribute's type text.
// NOTE(review): the body of the while loop below is not visible here.
84 std::string dimType =
"";
85 while (it != tok.end())
// Keep an untransformed copy of the type text: dimType is transformed in place
// next, while the category names are later taken from origDimType.
87 std::string origDimType(dimType);
88 std::transform(dimType.begin(), dimType.end(), dimType.begin(),
// "numeric", "integer", and "real" attributes are flagged false in `types`.
91 if (dimType ==
"numeric" || dimType ==
"integer" || dimType ==
"real")
93 types.push_back(
false);
// "string" attributes are flagged true in `types`.
95 else if (dimType ==
"string")
97 types.push_back(
true);
// A type beginning with '{' is flagged true as well, and its category list is
// extracted below.
99 else if (dimType[0] ==
'{')
105 types.push_back(
true);
// Strip braces, spaces, and tabs from both ends of the original type text.
106 boost::trim_if(origDimType,
109 return c ==
'{' || c ==
'}' || c ==
' ' || c ==
'\t';
// Split the remaining text on commas (backslash escape; double and single
// quotes as quote characters).  Note that `sep` and `it` reuse the names
// already declared for the annotation tokenizer above.
112 boost::escaped_list_separator<char> sep(
"\\",
",",
"\"'");
113 Tokenizer dimTok(origDimType, sep);
114 Tokenizer::iterator it = dimTok.begin();
115 std::vector<std::string> categories;
// Trim whitespace from each category name and collect it.
// NOTE(review): the iterator advance for this loop is on a line not shown.
117 while (it != dimTok.end())
119 std::string category = (*it);
120 boost::trim(category);
121 categories.push_back(category);
// Store the collected categories under the index dimensionality - 1.
126 categoryStrings[dimensionality - 1] = std::move(categories);
129 else if (annotation ==
"@data")
// Error for an unrecognized annotation; the message includes the line's first
// token.
136 throw std::runtime_error(
"unknown ARFF annotation '" + (*tok.begin()) +
// Error for a file without an @data section.
// NOTE(review): the condition guarding this throw is not visible here.
143 throw std::runtime_error(
"no @data section found");
// Error raised when the supplied DatasetInfo's dimensionality does not fit the
// data, per the message text.
// NOTE(review): the check itself and the remainder of the message are on lines
// not shown.
152 std::ostringstream oss;
153 oss <<
"data::LoadARFF(): given DatasetInfo has dimensionality " 156 throw std::invalid_argument(oss.str());
// Set each dimension's type in `info` to categorical or numeric.
// NOTE(review): the test selecting between the two assignments is not visible;
// presumably it is types[i].
159 for (
size_t i = 0; i < types.size(); ++i)
162 info.
Type(i) = Datatype::categorical;
164 info.
Type(i) = Datatype::numeric;
// Pass every category string declared in the header to info.MapString(),
// together with the index of the dimension it belongs to.
168 typedef std::map<size_t, std::vector<std::string>>::const_iterator
170 for (IteratorType it = categoryStrings.begin(); it != categoryStrings.end();
173 for (
const std::string& str : (*it).second)
175 info.template MapString<eT>(str, (*it).first);
// Record the current stream position.
// NOTE(review): presumably so the stream can be rewound after the lines read
// below; the code that uses `pos` is not visible here.
180 std::streampos pos = ifs.tellg();
184 std::getline(ifs, line,
'\n');
// Size the output as dimensionality x row: one matrix row per dimension, one
// matrix column per counted line.
195 matrix.set_size(dimensionality, row);
// Read the next data line.
201 std::getline(ifs, line,
'\n');
// Sparse ARFF data is rejected with an exception.
// NOTE(review): the test that detects sparse data is not visible here.
212 throw std::runtime_error(
"cannot yet parse sparse ARFF data");
// Tokenizer for a data line: split on commas, backslash as the escape
// character, and only the double quote as a quote character.
215 typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
216 boost::escaped_list_separator<char> sep(
"\\",
",",
"\"");
217 Tokenizer tok(line, sep);
// Stream used for the numeric branch below (its contents are read back with
// token.str() when building the error message).
220 std::stringstream token;
221 for (Tokenizer::iterator it = tok.begin(); it != tok.end(); ++it)
// A line with more tokens than the matrix has rows is an error.
224 if (col >= matrix.n_rows)
226 std::stringstream error;
227 error <<
"Too many columns in line " << (headerLines + row) <<
".";
228 throw std::runtime_error(error.str());
// Categorical dimension: map the token string to a value through `info`.
232 if (info.
Type(col) == Datatype::categorical)
// This local string has the same name as the std::stringstream `token`
// declared above.
235 std::string token = *it;
// Record the mapping count before calling MapString() on the token.
237 const size_t currentNumMappings = info.
NumMappings(col);
238 const eT result = info.template MapString<eT>(token, col);
// Applies only to dimensions whose categories were declared in the header.
// NOTE(review): the second operand of this condition is on a line not shown.
242 if (categoryStrings.count(col) > 0 &&
// Build an error naming the offending token and listing the known categories.
245 std::stringstream error;
246 error <<
"Parse error at line " << (headerLines + row) <<
" token " 247 << col <<
": category \"" << token <<
"\" not in the set of known" 248 <<
" categories for this dimension (";
// All categories except the last are followed by ", "; the last one is
// written separately so that it can close the parenthesis.
249 for (
size_t i = 0; i < categoryStrings.at(col).size() - 1; ++i)
250 error <<
"\"" << categoryStrings.at(col)[i] <<
"\", ";
251 error <<
"\"" << categoryStrings.at(col).back() <<
"\").";
252 throw std::runtime_error(error.str());
// Store the mapped value at (dimension, data row).
256 matrix(col, row) = result;
// Numeric dimension.
// NOTE(review): the statements that parse the token into `val` are on lines
// not shown.
258 else if (info.
Type(col) == Datatype::numeric)
// Build the error message for a failed numeric token: it starts with either
// the missing-value text or "Parse error " (the selecting test is not visible),
// followed by the line number, the token index, and the trimmed token text.
274 std::stringstream error;
275 std::string tokenStr = token.str();
276 boost::trim(tokenStr);
278 error <<
"Missing values ('?') not supported, ";
280 error <<
"Parse error ";
281 error <<
"at line " << (headerLines + row) <<
" token " << col
282 <<
": \"" << tokenStr <<
"\".";
283 throw std::runtime_error(error.str());
// Store the numeric value at (dimension, data row).
288 matrix(col, row) = val;
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the datatype of each dimension.
Definition: dataset_mapper.hpp:41
void LoadARFF(const std::string &filename, arma::Mat< eT > &matrix)
A utility function to load an ARFF dataset as numeric features (that is, as an Armadillo matrix without any modification).
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
Definition: log.hpp:90
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
bool IsNaNInf(T &val, const std::string &token)
See if the token is a NaN or an Inf, and if so, set the value accordingly and return a boolean representing whether or not it is.
Definition: is_naninf.hpp:27
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information for).
Definition: dataset_mapper_impl.hpp:228
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
Definition: dataset_mapper_impl.hpp:196
size_t NumMappings(const size_t dimension) const
Get the number of mappings for a particular dimension.
Definition: dataset_mapper_impl.hpp:222