JASSv2
serialise_jass_v1.h
Go to the documentation of this file.
1 /*
2  SERIALISE_JASS_V1.H
3  -------------------
4  Copyright (c) 2016 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
13 #pragma once
14 
15 #include "file.h"
16 #include "slice.h"
17 #include "allocator_cpp.h"
18 #include "index_postings.h"
19 #include "index_manager.h"
22 
23 namespace JASS
24  {
25  /*
26  CLASS SERIALISE_JASS_V1
27  -----------------------
28  */
71  {
72  protected:
73  /*
74  CLASS SERIALISE_JASS_V1::VOCAB_TRIPPLE
75  --------------------------------------
76  */
81  {
82  public:
84  uint64_t term;
85  uint64_t offset;
86  uint64_t impacts;
87 
88  public:
89  /*
90  SERIALISE_JASS_V1::VOCAB_TRIPPLE::VOCAB_TRIPPLE()
91  --------------------------------------------------
92  */
100  vocab_tripple(const slice &string, uint64_t term, uint64_t offset, uint64_t impacts) :
101  token(string), // the term as a slice; operator<() orders objects on this member
102  term(term), // member(parameter): each remaining parameter has the same name as the member it initialises
103  offset(offset),
104  impacts(impacts)
105  {
106  /* Nothing to do here: the initialiser list above sets token, term, offset and impacts. */
107  }
108 
109  /*
110  SERIALISE_JASS_V1::VOCAB_TRIPPLE::OPERATOR<()
111  ----------------------------------------------
112  */
118  bool operator<(const vocab_tripple &other) const
119  {
120  return slice::strict_weak_order_less_than(token, other.token); // only the token slices are compared; term, offset and impacts play no part in the ordering
121  }
122  };
123 
124  public:
125  /*
126  ENUM JASS_V1_CODEX
127  ------------------
128  */
133  {
134  uncompressed = 's',
136  simple_8b = '8',
137  qmx = 'q',
138  qmx_d4 = 'Q',
139  qmx_d0 = 'R',
143  };
144 
145  protected:
150  std::vector<vocab_tripple> index_key;
151  std::vector<uint64_t> primary_key_offsets;
154  std::string compressor_name;
158  std::vector<uint8_t, allocator_cpp<uint8_t>> compressed_buffer;
159  std::vector<slice, allocator_cpp<slice>> compressed_segments;
160  uint8_t alignment;
161 
162  protected:
163  /*
164  SERIALISE_JASS_V1::WRITE_POSTINGS()
165  -----------------------------------
166  */
176  virtual size_t write_postings(const index_postings &postings, size_t &number_of_impacts, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies);
177 
178  public:
179  /*
180  SERIALISE_JASS_V1::SERIALISE_JASS_V1()
181  --------------------------------------
182  */
189  serialise_jass_v1(size_t documents, jass_v1_codex codex = jass_v1_codex::elias_gamma_simd, int8_t alignment = 1) :
190  index_manager::delegate(documents),
191  vocabulary_strings("CIvocab_terms.bin", "w+b"),
192  vocabulary("CIvocab.bin", "w+b"),
193  postings("CIpostings.bin", "w+b"),
194  primary_keys("CIdoclist.bin", "w+b"),
195  memory(1024 * 1024),
196  impact_ordered(documents, memory),
197  encoder(get_compressor(codex, compressor_name, compressor_d_ness)), // compressor_name and compressor_d_ness are passed by non-const reference, presumably so get_compressor() can fill them in - confirm in the .cpp
198  allocator(memory),
199  compressed_buffer(allocator),
200  compressed_segments(allocator),
201  alignment(alignment) // NOTE(review): the parameter is int8_t but the member is uint8_t - confirm a negative alignment is never passed
202  {
203  /*
204  Allocate space for storing the compressed postings.  Deliberately over-allocate, as some
205  encoders can't write a sequence smaller than a minimum size.
206 
207  How large does this need to be?
208  We store the:
209  postings, each docid is 8 bytes and there are |documents| of those
210  The impact header consisting of an impact (2 bytes) + start_pointer (8 bytes) + length (8 bytes) + frequency (4 bytes), 22 bytes in total.
211  There are index_postings_impact::largest_impact of those.
212  To make things worse, each of these is stored compressed, and so might be bigger than the
213  raw size by 8/7 (assuming variable byte), so the raw storage is:
214  8/7 *(documents * 8 + 22 * index_postings_impact::largest_impact)
215  but, each of these two things is stored in a vector as a slice and each slice takes
216  8 bytes for the address and 8 bytes for the size giving an additional 2 * 16 * index_postings_impact::largest_impact
217  giving a total of
218  8/7 *(documents * 8 + (22 + 2 * 16) * index_postings_impact::largest_impact)
219  and now let's add a bit for reasons we can't predict (the std::vector has house-keeping)
220  1 MB
221  and make sure integer rounding doesn't get this wrong (multiply by 8 before dividing by 7):
222  8 * (documents * 8 + (22 + 2 * 16) * index_postings_impact::largest_impact) / 7 + 1024 * 1024
223  */
224  compressed_buffer.resize(8 * (documents * 8 + (22 + 2 * 16) * index_postings_impact::largest_impact) / 7 + 1024 * 1024);
225  compressed_segments.reserve(index_postings_impact::largest_impact);
226 
227 // std::cout << compressor_name << "-D" << compressor_d_ness << "\n";
228 
229  postings.write(&codex, 1); // the only write this constructor makes to the postings file: 1 byte taken from the address of codex.  NOTE(review): this assumes the codex value sits in that first byte (1-byte underlying type, or a little-endian host) - confirm against the enum declaration
230  }
231 
232  /*
233  SERIALISE_JASS_V1::~SERIALISE_JASS_V1()
234  --------------------------------------
235  */
240  {
241  /* Nothing */
242  }
243 
244  /*
245  SERIALISE_JASS_V1::FINISH()
246  ---------------------------
247  */
251  virtual void finish(void);
252 
253  /*
254  SERIALISE_JASS_V1::SERIALISE_VOCABULARY_POINTERS()
255  --------------------------------------------------
256  */
260  virtual void serialise_vocabulary_pointers(void);
261 
262  /*
263  SERIALISE_JASS_V1::SERIALISE_PRIMARY_KEYS()
264  --------------------------------------------
265  */
269  virtual void serialise_primary_keys(void);
270 
271  /*
272  SERIALISE_JASS_V1::DELEGATE::OPERATOR()()
273  -----------------------------------------
274  */
283  virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies);
284 
285  /*
286  SERIALISE_JASS_V1::DELEGATE::OPERATOR()()
287  -----------------------------------------
288  */
294  virtual void operator()(size_t document_id, const slice &primary_key);
295 
296 
297  /*
298  SERIALISE_JASS_V1::GET_COMPRESSOR()
299  -----------------------------------
300  */
308  static compress_integer *get_compressor(jass_v1_codex codex, std::string &name, int32_t &d_ness);
309 
310  /*
311  SERIALISE_JASS_V1::UNITTEST()
312  -----------------------------
313  */
317  static void unittest(void);
318  };
319  }
file vocabulary
Details about the term (including a pointer to the term, a pointer to the postings, and the quantum count).
Definition: serialise_jass_v1.h:147
Non-thread-safe object that accumulates a single postings list during indexing.
Definition: index_postings.h:40
std::vector< vocab_tripple > index_key
The entry point into the JASS v1 index is CIvocab.bin, the index key.
Definition: serialise_jass_v1.h:150
static bool strict_weak_order_less_than(const slice &me, const slice &with)
Return true if this < with.
Definition: slice.h:313
virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
The callback function to serialise the postings (given the term) is operator().
Definition: serialise_jass_v1.cpp:199
C++ slices (string-descriptors)
Definition: slice.h:27
Non-thread-Safe object that holds a single postings list during indexing.
std::vector< uint64_t > primary_key_offsets
A list of locations (on disk) of each primary key.
Definition: serialise_jass_v1.h:151
virtual size_t write_postings(const index_postings &postings, size_t &number_of_impacts, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
Convert the postings list to the JASS v1 format and serialise it to disk.
Definition: serialise_jass_v1.cpp:76
Base class for the indexer object that stored the actual index during indexing.
Compression codexes for integer sequences.
Definition: compress_integer.h:34
The tripple used in CIvocab.bin.
Definition: serialise_jass_v1.h:80
virtual ~serialise_jass_v1()
Destructor.
Definition: serialise_jass_v1.h:239
std::string compressor_name
The name of the compression algorithm.
Definition: serialise_jass_v1.h:154
uint32_t integer
This class and descendants will work on integers of this size. Do not change without also changing JA...
Definition: compress_integer.h:40
uint64_t offset
The pointer to the postings stored in the CIpostings.bin file.
Definition: serialise_jass_v1.h:85
allocator_pool memory
Memory used to store the impact-ordered postings list.
Definition: serialise_jass_v1.h:152
Postings are compressed using Elias gamma SIMD encoding with variable byte endings.
Definition: serialise_jass_v1.h:141
vocab_tripple(const slice &string, uint64_t term, uint64_t offset, uint64_t impacts)
Constructor.
Definition: serialise_jass_v1.h:100
Partial file and whole file based I/O methods.
Postings are not compressed.
Definition: serialise_jass_v1.h:134
virtual void serialise_primary_keys(void)
Serialise the primary keys (or any extra stuff at the end of the primary key file).
Definition: serialise_jass_v1.cpp:61
Postings are compressed using ATIRE's variable byte encoding.
Definition: serialise_jass_v1.h:135
size_t write(const void *data, size_t size)
Write size bytes to the given file at the current cursor position.
Definition: file.h:315
Postings are compressed using ATIRE's simple-8b encoding.
Definition: serialise_jass_v1.h:136
Serialise an index in the experimental JASS-CI format used (by JASS version 1) in the RIGOR workshop...
Definition: serialise_jass_v1.h:70
virtual void serialise_vocabulary_pointers(void)
Serialise the pointers that point between the vocab and the postings (the CIvocab.bin file).
Definition: serialise_jass_v1.cpp:39
allocator_cpp< uint8_t > allocator
C++ allocator between memory object and std::vector object.
Definition: serialise_jass_v1.h:157
static compress_integer * get_compressor(jass_v1_codex codex, std::string &name, int32_t &d_ness)
Return a reference to a compressor/decompressor that can be used with this index. ...
Definition: serialise_jass_v1.cpp:241
Postings are compressed using QMX with Lemire's D4 delta encoding.
Definition: serialise_jass_v1.h:138
index_postings_impact impact_ordered
The re-used impact ordered postings list.
Definition: serialise_jass_v1.h:153
C++11 allocator class that uses a C allocator. See here: https://msdn.microsoft.com/en-us/library/aa9...
Postings are compressed using Elias delta SIMD encoding.
Definition: serialise_jass_v1.h:142
Holder class for an impact ordered postings list.
Definition: index_postings_impact.h:31
static constexpr size_t largest_impact
The largest allowable impact score (255 is a good value).
Definition: index_postings_impact.h:42
slice token
The term as a string (needed for sorting the std::vector vocab_tripple array later) ...
Definition: serialise_jass_v1.h:83
compress_integer * encoder
The integer encoder used to compress postings lists.
Definition: serialise_jass_v1.h:156
Simple block-allocator that internally allocates a large chunk then allocates smaller blocks from thi...
Definition: allocator_pool.h:61
uint8_t alignment
Postings lists are padded to this alignment (used for codexes that require word alignment).
Definition: serialise_jass_v1.h:160
serialise_jass_v1(size_t documents, jass_v1_codex codex=jass_v1_codex::elias_gamma_simd, int8_t alignment=1)
Constructor.
Definition: serialise_jass_v1.h:189
uint64_t term
The pointer to the \0 terminated string in the CIvocab_terms.bin file.
Definition: serialise_jass_v1.h:84
uint64_t impacts
The number of impacts that exist for this term.
Definition: serialise_jass_v1.h:86
delegate(size_t documents)
Constructor.
Definition: index_manager.h:60
file vocabulary_strings
The concatenation of UTF-8 encoded unique tokens in the collection.
Definition: serialise_jass_v1.h:146
file primary_keys
The list of external identifiers (document primary keys).
Definition: serialise_jass_v1.h:149
Definition: document_id.h:16
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
bool operator<(const vocab_tripple &other) const
Compare (using strcmp() collating sequence) this object with another for less than.
Definition: serialise_jass_v1.h:118
virtual void finish(void)
Finish up any serialising that needs to be done.
Definition: serialise_jass_v1.cpp:22
Postings are compressed using JASS v1's variant of QMX (with difference (D1) encoding).
Definition: serialise_jass_v1.h:137
std::vector< uint8_t, allocator_cpp< uint8_t > > compressed_buffer
The buffer used to compress postings into.
Definition: serialise_jass_v1.h:158
jass_v1_codex
The compression scheme that is active.
Definition: serialise_jass_v1.h:132
int compressor_d_ness
The d-ness of the compression algorithm.
Definition: serialise_jass_v1.h:155
QMX version compatible with JASS v1.
file postings
The postings lists.
Definition: serialise_jass_v1.h:148
Slices (also known as string-descriptors) for C++.
size_t documents
The number of documents in the collection.
Definition: index_manager.h:50
File based I/O methods including whole file and partial files.
Definition: file.h:45
Base class for holding the index during indexing.
Definition: index_manager.h:33
std::vector< slice, allocator_cpp< slice > > compressed_segments
vector of pointers (and lengths) to the compressed postings.
Definition: serialise_jass_v1.h:159
Base class for the callback function called by iterate.
Definition: index_manager.h:47
static void unittest(void)
Unit test this class.
Definition: serialise_jass_v1.cpp:273
Definition: compress_integer_elias_delta_simd.c:23
Pack 32-bit integers into 512-bit SIMD words using elias gamma encoding.
Postings are compressed using Elias gamma SIMD encoding.
Definition: serialise_jass_v1.h:140
Postings are compressed using QMX without delta encoding.
Definition: serialise_jass_v1.h:139