crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
CorpusProperties.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * CorpusProperties.hpp
24  *
25  * Corpus properties (source type, table and field).
26  *
27  * Created on: Feb 2, 2019
28  * Author: ans
29  */
30 
31 #ifndef STRUCT_CORPUSPROPERTIES_HPP_
32 #define STRUCT_CORPUSPROPERTIES_HPP_
33 
34 #include <cstdint> // std::uint16_t, std::uint64_t
35 #include <string> // std::string
36 #include <vector> // std::vector
37 
38 namespace crawlservpp::Struct {
39 
//! Corpus properties containing the type, table, and column name of its source.
struct CorpusProperties {
	//! The type of the source from which the corpus is created.
	std::uint16_t sourceType{};

	//! The name of the table from which the corpus is created.
	std::string sourceTable;

	//! The name of the table column from which the corpus is created.
	std::string sourceColumn;

	//! The IDs of manipulators for preprocessing the corpus.
	std::vector<std::uint16_t> manipulators;

	//! The models used by the manipulators with the same array index.
	std::vector<std::string> models;

	//! The dictionaries used by the manipulators with the same array index.
	std::vector<std::string> dictionaries;

	//! The languages used by the manipulators with the same array index.
	std::vector<std::string> languages;

	//! List of savepoints.
	/*!
	 * \note The double braces are list-initialization
	 *   with one value-initialized element, i.e. the
	 *   default list contains a single zero and is
	 *   NOT empty.
	 */
	std::vector<std::uint16_t> savePoints{{}};

	//! Number of processed bytes in a continuous corpus after which memory will be freed.
	std::uint64_t freeMemoryEvery{};

	//! Tokenization.
	/*!
	 * Set to @c true by the constructor for a
	 *  tokenized corpus only; remains @c false
	 *  otherwise.
	 */
	bool tokenize{false};

	//! Default constructor.
	CorpusProperties() = default;

	//! Constructor setting properties for a tokenized corpus.
	/*!
	 * Copies all arguments into the corresponding
	 *  members and sets @c tokenize to @c true.
	 *
	 * \param setSourceType Value for @c sourceType.
	 * \param setSourceTable Value for @c sourceTable.
	 * \param setSourceColumn Value for @c sourceColumn.
	 * \param setManipulators Value for @c manipulators.
	 * \param setModels Value for @c models.
	 * \param setDictionaries Value for @c dictionaries.
	 * \param setLanguages Value for @c languages.
	 * \param setSavePoints Value for @c savePoints
	 *   (replaces the default list).
	 * \param setFreeMemoryEvery Value for
	 *   @c freeMemoryEvery.
	 */
	CorpusProperties(
			std::uint16_t setSourceType,
			const std::string& setSourceTable,
			const std::string& setSourceColumn,
			const std::vector<std::uint16_t>& setManipulators,
			const std::vector<std::string>& setModels,
			const std::vector<std::string>& setDictionaries,
			const std::vector<std::string>& setLanguages,
			const std::vector<std::uint16_t>& setSavePoints,
			std::uint64_t setFreeMemoryEvery
	) : sourceType{setSourceType},
		sourceTable{setSourceTable},
		sourceColumn{setSourceColumn},
		manipulators{setManipulators},
		models{setModels},
		dictionaries{setDictionaries},
		languages{setLanguages},
		savePoints{setSavePoints},
		freeMemoryEvery{setFreeMemoryEvery},
		tokenize{true} {}

	//! Constructor setting properties for a continuous corpus.
	/*!
	 * Copies the arguments into the corresponding
	 *  members. All other members keep their
	 *  in-class defaults, i.e. @c tokenize stays
	 *  @c false.
	 *
	 * \param setSourceType Value for @c sourceType.
	 * \param setSourceTable Value for @c sourceTable.
	 * \param setSourceColumn Value for @c sourceColumn.
	 * \param setFreeMemoryEvery Value for
	 *   @c freeMemoryEvery.
	 */
	CorpusProperties(
			std::uint16_t setSourceType,
			const std::string& setSourceTable,
			const std::string& setSourceColumn,
			std::uint64_t setFreeMemoryEvery
	) : sourceType{setSourceType},
		sourceTable{setSourceTable},
		sourceColumn{setSourceColumn},
		freeMemoryEvery{setFreeMemoryEvery} {}
};
209 
210 } /* namespace crawlservpp::Struct */
211 
212 #endif /* STRUCT_CORPUSPROPERTIES_HPP_ */
CorpusProperties(std::uint16_t setSourceType, const std::string &setSourceTable, const std::string &setSourceColumn, std::uint64_t setFreeMemoryEvery)
Constructor setting properties for a continuous corpus.
Definition: CorpusProperties.hpp:197
CorpusProperties()=default
Default constructor.
std::uint64_t freeMemoryEvery
Number of processed bytes in a continuous corpus after which memory will be freed.
Definition: CorpusProperties.hpp:90
bool tokenize
Tokenization.
Definition: CorpusProperties.hpp:97
CorpusProperties(std::uint16_t setSourceType, const std::string &setSourceTable, const std::string &setSourceColumn, const std::vector< std::uint16_t > &setManipulators, const std::vector< std::string > &setModels, const std::vector< std::string > &setDictionaries, const std::vector< std::string > &setLanguages, const std::vector< std::uint16_t > &setSavePoints, std::uint64_t setFreeMemoryEvery)
Constructor setting properties for a tokenized corpus.
Definition: CorpusProperties.hpp:156
Corpus properties containing the type, table, and column name of its source.
Definition: CorpusProperties.hpp:41
std::uint16_t sourceType
The type of the source from which the corpus is created (see below).
Definition: CorpusProperties.hpp:52
std::vector< std::string > models
The models used by the manipulators with the same array index.
Definition: CorpusProperties.hpp:64
std::vector< std::uint16_t > savePoints
List of savepoints.
Definition: CorpusProperties.hpp:83
Namespace for data structures.
Definition: AlgoThreadProperties.hpp:43
std::string sourceColumn
The name of the table column from which the corpus is created.
Definition: CorpusProperties.hpp:58
std::vector< std::uint16_t > manipulators
The IDs of manipulators for preprocessing the corpus.
Definition: CorpusProperties.hpp:61
std::vector< std::string > languages
The languages used by the manipulators with the same array index.
Definition: CorpusProperties.hpp:70
std::string sourceTable
The name of the table from which the corpus is created.
Definition: CorpusProperties.hpp:55
std::vector< std::string > dictionaries
The dictionaries used by the manipulators with the same array index.
Definition: CorpusProperties.hpp:67