crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
OpenDocument.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * OpenDocument.hpp
24  *
25  * Namespace for functions to import/export data from/to files in the OpenDocument format.
26  *
27  * Created on: Jan 4, 2021
28  * Author: ans
29  */
30 
31 #ifndef DATA_IMPORTEXPORT_OPENDOCUMENT_HPP_
32 #define DATA_IMPORTEXPORT_OPENDOCUMENT_HPP_
33 
34 #include "../Compression/Zip.hpp"
35 
36 #include "../../Helper/Strings.hpp"
37 
38 #include <clocale> // std::setlocale
39 #include <cstdint> // std::uint8_t
40 #include <string> // std::stod, std::string, std::to_string
41 #include <utility> // std::pair
42 #include <vector> // std::vector
43 
46 
47  /*
48  * CONSTANTS
49  */
50 
/// The number of spaces before an OpenDocument XML cell element.
inline constexpr auto cellSpacing = 5;

/// The number of lines used for an OpenDocument XML cell element and its content
/// (i.e. how often the leading spaces are repeated per cell).
inline constexpr auto cellLines = 3;

/// The number of additional (constant) characters for an OpenDocument XML cell
/// element and its content, used to pre-size the result string of a cell.
inline constexpr auto cellConstChars = 57;
59 
60  /*
61  * DECLARATION
62  */
63 
/// A pair of strings.
using StringString = std::pair<std::string, std::string>;

/// A vector of strings used as rows in a spreadsheet table.
using TableRow = std::vector<std::string>;

/// A vector of vectors of strings used as spreadsheet tables.
using Table = std::vector<TableRow>;

/// A pair containing the name and the content of a spreadsheet table.
using NamedTable = std::pair<std::string, Table>;
75 
78 
79  std::string exportSpreadsheet(
80  const std::vector<NamedTable>& tables,
81  bool firstRowBold
82  );
83 
87 
88  std::string cell(std::uint8_t spacing, const std::string& raw, const std::string& style);
89 
91 
92  /*
93  * IMPLEMENTATION
94  */
95 
97 
117  inline std::string exportSpreadsheet(const std::vector<NamedTable>& tables, bool firstRowBold) {
118  std::vector<StringString> fileContents;
119 
120  // add MIME type
121  fileContents.emplace_back("mimetype", "application/vnd.oasis.opendocument.spreadsheet");
122 
123  // add package manifest
124  fileContents.emplace_back(
125  "META-INF/manifest.xml",
126  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
127  "<manifest:manifest manifest-version=\"1.2\""
128  " xmlns:manifest=\"urn:oasis:names:tc:opendocument:xmlns:manifest:1.0\">\n"
129  " <manifest:file-entry manifest:full-path=\"/\" manifest:version=\"1.2\""
130  " manifest:media-type=\"application/vnd.oasis.opendocument.spreadsheet\"/>"
131  " <manifest:file-entry manifest:full-path=\"content.xml\" manifest:media-type=\"text/xml\"/>\n"
132  "</manifest:manifest>"
133  );
134 
135  // create content
136  std::string content{
137  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
138  "<office:document-content"
139  " xmlns:office=\"urn:oasis:names:tc:opendocument:xmlns:office:1.0\""
140  " xmlns:table=\"urn:oasis:names:tc:opendocument:xmlns:table:1.0\""
141  " xmlns:text=\"urn:oasis:names:tc:opendocument:xmlns:text:1.0\""
142  " xmlns:style=\"urn:oasis:names:tc:opendocument:xmlns:style:1.0\""
143  " xmlns:fo=\"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0\">"
144  " <office:automatic-styles>\n"
145  " <style:style style:name=\"headings\" style:family=\"table-cell\">\n"
146  " <style:text-properties fo:font-weight=\"bold\" style:font-weight-asian=\"bold\""
147  " style:font-weight-complex=\"bold\" />\n"
148  " </style:style>\n"
149  " </office:automatic-styles>\n"
150  " <office:body>\n"
151  " <office:spreadsheet>\n"
152  };
153 
154  for(const auto& table : tables) {
155  content += " <table:table table:name=\"" + table.first + "\">\n";
156 
157  bool boldRow{firstRowBold};
158 
159  for(const auto& row : table.second) {
160  std::string style;
161 
162  if(boldRow) {
163  style = "table:style-name=\"headings\"";
164 
165  boldRow = false;
166  }
167 
168  content += " <table:table-row>\n";
169 
170  for(const auto& cell : row) {
171  content += OpenDocument::cell(cellSpacing, cell, style);
172  }
173 
174  content += " </table:table-row>\n";
175  }
176 
177  content += " </table:table>\n";
178  }
179 
180  content += " </office:spreadsheet>\n"
181  " </office:body>\n"
182  "</office:document-content>\n";
183 
184  fileContents.emplace_back("content.xml", content);
185 
186  return Data::Compression::Zip::compress(fileContents);
187  }
188 
190 
214  inline std::string cell(std::uint8_t spacing, const std::string& raw, const std::string& style) {
215  const std::string spaces(spacing, ' ');
216 
217  if(raw.empty()) {
218  return spaces + "<table:table-cell />\n";
219  }
220 
221  std::string content{raw};
222  std::string attributes{};
223  double numericValue{};
224  bool isString{true};
225 
226  if(!style.empty()) {
227  attributes.reserve(style.size() + 1);
228 
229  attributes.push_back(' ');
230 
231  attributes += style;
232  }
233 
234  if(Helper::Strings::isDec(raw)) {
235  // try to convert to numeric value
236  const auto * oldLocale{
237  std::setlocale(LC_NUMERIC, "C")
238  };
239 
240  try {
241  numericValue = std::stod(raw);
242 
243  isString = false;
244 
245  attributes += R"( office:value-type="float" office:value=")";
246  attributes += std::to_string(numericValue);
247  attributes += "\"";
248  }
249  catch(const std::logic_error& /*unused*/) {}
250 
251  // reset C locale
252  std::setlocale(LC_NUMERIC, oldLocale);
253  }
254 
255  if(isString) {
256  // replace special characters
257  Helper::Strings::replaceAll(content, "&", "&amp;");
258  Helper::Strings::replaceAll(content, "'", "&apos;");
259  Helper::Strings::replaceAll(content, ">", "&gt;");
260  Helper::Strings::replaceAll(content, "<", "&lt;");
261  Helper::Strings::replaceAll(content, "\"", "&quot;");
262  }
263 
264  std::string result{};
265 
266  result.reserve(cellLines * spaces.size() + attributes.size() + content.size() + cellConstChars);
267 
268  result = spaces;
269 
270  result += "<table:table-cell";
271  result += attributes;
272  result += ">\n";
273  result += spaces;
274  result += "<text:p>";
275  result += content;
276  result += "</text:p>\n";
277  result += spaces;
278  result += "</table:table-cell>\n";
279 
280  return result;
281  }
282 
283 } /* namespace crawlservpp::Data::ImportExport::OpenDocument */
284 
285 #endif /* DATA_IMPORTEXPORT_OPENDOCUMENT_HPP_ */
constexpr auto cellConstChars
The number of additional characters for an OpenDocument XML cell element and its content.
Definition: OpenDocument.hpp:58
std::vector< TableRow > Table
A vector of vectors of strings used as spreadsheet tables.
Definition: OpenDocument.hpp:71
std::vector< std::string > TableRow
A vector of strings used as rows in a spreadsheet table.
Definition: OpenDocument.hpp:68
std::pair< std::string, Table > NamedTable
A pair containing the name and the content of a spreadsheet table.
Definition: OpenDocument.hpp:74
std::string compress(const std::vector< StringString > &fileContents)
Compresses files using zip.
Definition: Zip.hpp:88
constexpr auto cellLines
The number of lines used for an OpenDocument XML cell element and its content.
Definition: OpenDocument.hpp:55
std::pair< std::string, std::string > StringString
A pair of strings.
Definition: OpenDocument.hpp:65
void replaceAll(std::string &strInOut, std::string_view needle, std::string_view replacement)
Replaces all occurrences within a string with another string.
Definition: Strings.hpp:246
constexpr auto cellSpacing
The number of spaces before an OpenDocument XML cell element.
Definition: OpenDocument.hpp:52
bool isDec(std::string_view inputString)
Checks whether a string contains only decimal digits and max. one dot (.).
Definition: Strings.hpp:303
Namespace for importing and exporting OpenDocument spreadsheets.
Definition: OpenDocument.hpp:45
std::string exportSpreadsheet(const std::vector< NamedTable > &tables, bool firstRowBold)
Exports tables as an OpenDocument spreadsheet.
Definition: OpenDocument.hpp:117
std::string cell(std::uint8_t spacing, const std::string &raw, const std::string &style)
Creates the XML code for a simple cell containing a value.
Definition: OpenDocument.hpp:214