JASSv2
ciff_lin.h
Go to the documentation of this file.
1 /*
2  CIFF_LIN.H
3  ----------
4  Copyright (c) 2019 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
39 #pragma once
40 
41 #include <stdint.h>
42 
43 #include <vector>
44 #include <limits>
45 
46 #include "posting.h"
47 #include "protobuf.h"
48 
49 namespace JASS
50  {
51  /*
52  CLASS CIFF_LIN
53  --------------
54  */
127  class ciff_lin
128  {
129  public:
135  {
136  OK = 0,
137  FAIL = 1
138  } ;
139 
140  /*
141  CLASS CIFF_LIN::HEADER
142  ----------------------
143  */
147  class header
148  {
149  public:
150  int32_t version;
152  int32_t num_docs;
154  int32_t total_docs;
157  std::string description;
158 
159  public:
160  /*
161  CIFF_LIN::HEADER::HEADER()
162  --------------------------
163  */
168  version(0),
169  num_postings_lists(0),
170  num_docs(0),
171  total_postings_lists(0),
172  total_docs(0),
173  total_terms_in_collection(0),
174  average_doclength(0),
175  description("")
176  {
177  /* Nothing */
178  }
179 
180  /*
181  CIFF_LIN::HEADER::TEXT_RENDER()
182  -------------------------------
183  */
187  void text_render(void)
188  {
189  std::cout << "version:" << version << "\n";
190  std::cout << "num_postings_lists:" << num_postings_lists << "\n";
191  std::cout << "num_docs:" << num_docs << "\n";
192  std::cout << "total_postings_lists:" << total_postings_lists << "\n";
193  std::cout << "total_docs:" << total_docs << "\n";
194  std::cout << "total_terms_in_collection:" << total_terms_in_collection << "\n";
195  std::cout << "average_doclength:" << average_doclength << "\n";
196  std::cout << "description:" << description << "\n";
197  }
198  };
199 
200  /*
201  CLASS CIFF_LIN::POSTINGS_LIST
202  -----------------------------
203  */
208  {
209  public:
213  std::vector<posting> postings;
214 
215  private:
216  /*
217  CIFF_LIN::POSTINGS_LIST::GET_NEXT_POSTINGS_PAIR()
218  -------------------------------------------------
219  */
227  static error_code get_next_postings_pair(posting &into, const uint8_t *&stream, const uint8_t *stream_end)
228  {
229  while (stream < stream_end)
230  {
231  protobuf::wire_type type;
232  uint8_t field = protobuf::get_type_and_field(type, stream);
233 
234  if (type == protobuf::VARINT)
235  {
236  uint64_t value = protobuf::get_uint64_t(stream);
237  if (field == 1)
238  into.docid = static_cast<uint32_t>(value);
239  else if (field == 2)
240  into.term_frequency = static_cast<uint32_t>(value);
241  else
242  return FAIL;
243  }
244  else
245  return FAIL;
246  }
247  return OK;
248  }
249 
250  public:
251  /*
252  CIFF_LIN::POSTINGS_LIST::GET_NEXT_POSTINGS()
253  --------------------------------------------
254  */
262  static error_code get_next_postings(postings_list &into, const uint8_t *&stream, const uint8_t *stream_end)
263  {
264  while (stream < stream_end)
265  {
266  protobuf::wire_type type;
267  uint8_t field = protobuf::get_type_and_field(type, stream);
268 
269  if (type == protobuf::VARINT)
270  {
271  uint64_t value = protobuf::get_uint64_t(stream);
272 
273  if (field == 2)
274  into.document_frequency = value;
275  else if (field == 3)
276  into.collection_frequency = value;
277  else
278  return FAIL;
279  }
280  else if (type == protobuf::BLOB)
281  {
282  slice term = protobuf::get_blob(stream);
283 
284  if (field == 1)
285  into.term = term;
286  else if (field == 4)
287  {
288  const uint8_t *here = reinterpret_cast<const uint8_t *> (term.address());
289  posting d_tf_pair;
290  if (get_next_postings_pair(d_tf_pair, here, here + term.size()) == FAIL)
291  return FAIL;
292  into.postings.push_back(d_tf_pair);
293  }
294  }
295  else
296  return FAIL;
297  }
298  return OK;
299  }
300 
301  /*
302  CIFF_LIN::POSTINGS_LIST::CLEAR()
303  --------------------------------
304  */
308  void clear(void)
309  {
310  term.clear();
311  document_frequency = 0;
312  collection_frequency = 0;
313  postings.clear();
314  }
315  };
316 
317  /*
318  CLASS CIFF_LIN::DOC_RECORD
319  --------------------------
320  */
325  {
326  public:
327  int32_t docid;
329  int32_t doclength;
330 
331  public:
332  /*
333  CIFF_LIN::DOC_RECORD::DOC_RECORD()
334  ----------------------------------
335  */
340  docid(0),
341  doclength(0)
342  {
343  /* Nothing */
344  }
345 
346  /*
347  CIFF_LIN::DOC_RECORD::CLEAR()
348  -----------------------------
349  */
353  void clear(void)
354  {
355  docid = 0;
356  collection_docid.clear();
357  doclength = 0;
358  }
359 
360  /*
361  CIFF_LIN::DOC_RECORD::GET_NEXT()
362  --------------------------------
363  */
367  static error_code get_next(doc_record &into, const uint8_t *&stream)
368  {
369  size_t length = protobuf::get_uint64_t(stream);
370  const uint8_t *stream_end = stream + length;
371 
372  while (stream < stream_end)
373  {
374  protobuf::wire_type type;
375  uint8_t field = protobuf::get_type_and_field(type, stream);
376 
377  if (type == protobuf::VARINT)
378  {
379  uint64_t value = protobuf::get_uint64_t(stream);
380 
381  if (field == 1)
382  into.docid = (uint32_t)value;
383  else if (field == 3)
384  into.doclength = (uint32_t)value;
385  else
386  return FAIL;
387  }
388  else if (type == protobuf::BLOB)
389  {
390  slice term = protobuf::get_blob(stream);
391  if (field == 2)
392  into.collection_docid = term;
393  else
394  return FAIL;
395  }
396  else
397  return FAIL;
398  }
399  return OK;
400  }
401  };
402 
403  private:
404  /*
405  CLASS CIFF_LIN::POSTINGS_LIST_ITERATOR
406  --------------------------------------
407  */
412  {
413  private:
415  const uint8_t *stream;
416  size_t which;
418 
419  public:
420  /*
421  CIFF_LIN::POSTINGS_LIST_ITERATOR::POSTINGS_LIST_ITERATOR()
422  ----------------------------------------------------------
423  */
429  postings_list_iterator(ciff_lin &of, size_t which) :
430  source(of),
431  stream(source.stream),
432  which(which)
433  {
434  /* Nothing */
435  }
436 
437  /*
438  CIFF_LIN::POSTINGS_LIST_ITERATOR::OPERATOR++()
439  ----------------------------------------------
440  */
446  {
447  which++;
448  return *this;
449  }
450 
451  /*
452  CIFF_LIN::POSTINGS_LIST_ITERATOR::OPERATOR*()
453  ---------------------------------------------
454  */
460  {
461  int64_t postings_list_length = protobuf::get_uint64_t(stream);
462 
463  postings.clear();
464  if (postings_list::get_next_postings(postings, stream, stream + postings_list_length) == FAIL)
465  {
466  /*
467  On error move to the end of the stream and mark the stream as bad
468  */
469  which = (std::numeric_limits<decltype(which)>::max)();
470  source.status = FAIL;
471  }
472 
473  return postings;
474  }
475 
476  /*
477  CIFF_LIN::POSTINGS_LIST_ITERATOR::OPERATOR!=()
478  ----------------------------------------------
479  */
485  {
486  bool answer = which <= with.which;
487  source.stream = stream;
488  return answer;
489  }
490  };
491 
492  /*
493  CLASS CIFF_LIN::POSTINGS_FOREACH
494  --------------------------------
495  */
500  {
501  private:
502  ciff_lin &parent;
503 
504  public:
505  postings_foreach(ciff_lin &object) :
506  parent(object)
507  {
508  /* Nothing */
509  }
510 
511  /*
512  CIFF_LIN::POSTINGS_FOREACH::BEGIN()
513  -----------------------------------
514  */
519  {
520  return ++ciff_lin::postings_list_iterator(parent, 0);
521  }
522 
523  /*
524  CIFF_LIN::POSTINGS_FOREACH::END()
525  ---------------------------------
526  */
531  {
533  }
534  };
535 
536  /*
537  CLASS CIFF_LIN::DOCRECORDS_ITERATOR
538  -----------------------------------
539  */
544  {
545  private:
547  const uint8_t *stream;
548  size_t which;
550 
551  public:
552 
553  /*
554  CIFF_LIN::DOCRECORDS_ITERATOR::DOCRECORDS_ITERATOR()
555  ----------------------------------------------------
556  */
562  docrecords_iterator(ciff_lin &of, size_t which) :
563  source(of),
564  stream(source.stream),
565  which(which)
566  {
567  /* Nothing */
568  }
569 
570  /*
571  CIFF_LIN::DOCRECORDS_ITERATOR::OPERATOR++()
572  -------------------------------------------
573  */
579  {
580  current.clear();
581 
582  if (doc_record::get_next(current, stream) == FAIL)
583  {
584  /*
585  On error move to the end of the stream and mark the stream as bad
586  */
587  which = (std::numeric_limits<decltype(which)>::max)();
588  source.status = FAIL;
589  }
590 
591  which++;
592  return *this;
593  }
594 
595  /*
596  CIFF_LIN::DOCRECORDS_ITERATOR::OPERATOR*()
597  ------------------------------------------
598  */
604  {
605  return current;
606  }
607 
608  /*
609  CIFF_LIN::DOCRECORDS_ITERATOR::OPERATOR!=()
610  -------------------------------------------
611  */
617  {
618  return which <= with.which;
619  }
620  };
621 
622  /*
623  CLASS CIFF_LIN::DOCRECORDS_FOREACH
624  ----------------------------------
625  */
630  {
631  private:
632  ciff_lin &parent;
633 
634  public:
635  docrecords_foreach(ciff_lin &object) :
636  parent(object)
637  {
638  /* Nothing */
639  }
640 
641  /*
642  CIFF_LIN::DOCRECORDS_FOREACH::BEGIN()
643  -------------------------------------
644  */
649  {
650  return ++ciff_lin::docrecords_iterator(parent, 0);
651  }
652 
653  /*
654  CIFF_LIN::DOCRECORDS_FOREACH::END()
655  -----------------------------------
656  */
661  {
662  return ciff_lin::docrecords_iterator(parent, parent.ciff_header.num_docs);
663  }
664  };
665 
666  private:
667  const uint8_t *source_file;
668  const uint8_t *stream;
670 
671  public:
673 
674  protected:
675  /*
676  READ_HEADER()
677  -------------
678  */
685  {
686  stream = source_file;
687  size_t length = protobuf::get_uint64_t(stream);
688  const uint8_t *stream_end = stream + length;
689 
690  while (stream < stream_end)
691  {
692  protobuf::wire_type type;
693  uint8_t field = protobuf::get_type_and_field(type, stream);
694 
695  if (type == protobuf::VARINT)
696  {
697  uint64_t value = protobuf::get_uint64_t(stream);
698 
699  if (field == 1)
700  header.version = (uint32_t)value;
701  else if (field == 2)
702  header.num_postings_lists = (uint32_t)value;
703  else if (field == 3)
704  header.num_docs = (uint32_t)value;
705  else if (field == 4)
706  header.total_postings_lists = (uint32_t)value;
707  else if (field == 5)
708  header.total_docs = (uint32_t)value;
709  else if (field == 6)
710  header.total_terms_in_collection = value;
711  else
712  return FAIL;
713  }
714  else if (type == protobuf::SIXTY_FOUR_BIT)
715  {
716  double value = protobuf::get_double(stream);
717  if (field == 7)
718  header.average_doclength = value;
719  else
720  return FAIL;
721  }
722  else if (type == protobuf::BLOB)
723  {
724  slice term = protobuf::get_blob(stream);
725 
726  if (field == 8)
727  header.description = std::string(reinterpret_cast<char *>(term.address()), term.size());
728  else
729  return FAIL;
730  }
731  else
732  return FAIL;
733  }
734  return OK;
735  }
736 
737  public:
738  /*
739  CIFF_LIN::CIFF_LIN()
740  --------------------
741  */
747  ciff_lin(const uint8_t *source_file) :
748  source_file(source_file),
749  stream(source_file),
750  status(OK)
751  {
752  read_header(ciff_header);
753  ciff_header.text_render();
754  }
755 
756  /*
757  CIFF_LIN::POSTINGS()
758  --------------------
759  */
765  {
766  return postings_foreach(*this);
767  }
768 
769  /*
770  CIFF_LIN::DOCRECORDS()
771  ----------------------
772  */
778  {
779  return docrecords_foreach(*this);
780  }
781 
782  /*
783  CIFF_LIN::GET_HEADER()
784  ----------------------
785  */
791  {
792  return ciff_header;
793  }
794  } ;
795  }
const uint8_t * source_file
The CIFF file in memory.
Definition: ciff_lin.h:667
uint64_t collection_frequency
The number of times the term occurs in the collection.
Definition: ciff_lin.h:212
int64_t total_terms_in_collection
The total number of terms in the entire collection. This is the sum of all document lengths of all do...
Definition: ciff_lin.h:155
error_code status
OK or FAIL (FAIL only on error in input stream)
Definition: ciff_lin.h:672
const uint8_t * stream
Where in the source stream we are reading from.
Definition: ciff_lin.h:547
postings_list_iterator(ciff_lin &of, size_t which)
Constructor.
Definition: ciff_lin.h:429
void clear(void)
removes all content from the postings list
Definition: ciff_lin.h:308
A <docid, tf> tuple.
Definition: posting.h:28
error_code read_header(header &header)
Read the CIFF header containing details about how many postings lists, etc.
Definition: ciff_lin.h:684
size_t which
Which doc record we are currently on (counting from 0)
Definition: ciff_lin.h:548
ciff_lin::docrecords_iterator end()
Return an iterator to the end of this object.
Definition: ciff_lin.h:660
iterator class for iterating over an index
Definition: ciff_lin.h:543
int32_t num_docs
Exactly the number of DocRecord messages that follow the PostingsList messages.
Definition: ciff_lin.h:152
uint64_t document_frequency
The number of documents containing the term.
Definition: ciff_lin.h:211
C++ slices (string-descriptors)
Definition: slice.h:27
bool operator!=(docrecords_iterator &with)
Compare two iterators.
Definition: ciff_lin.h:616
The header of the CIFF file, it happens first in the file and describes how many postings and documen...
Definition: ciff_lin.h:147
const uint8_t * stream
Where in the CIFF we currently are.
Definition: ciff_lin.h:668
void * address(void) const
Extract the pointer value from the slice.
Definition: slice.h:269
header ciff_header
The header from the CIFF file.
Definition: ciff_lin.h:669
Method completed successfully.
Definition: ciff_lin.h:136
std::vector< posting > postings
The postings list, a vector of <d,tf> tuples.
Definition: ciff_lin.h:213
size_t which
Which postings list we are currently on (counting from 0)
Definition: ciff_lin.h:416
Variable byte encoded integer of up-to 64 bits, signed or unsigned.
Definition: protobuf.h:44
static error_code get_next_postings_pair(posting &into, const uint8_t *&stream, const uint8_t *stream_end)
read a <d,tf> pair from a protobuf encoded stream
Definition: ciff_lin.h:227
slice term
The term as a <pointer,length> tuple.
Definition: ciff_lin.h:210
docrecords_foreach docrecords(void)
Return an object capable of being an iterator for document details. Assumes the "file pointer" is in ...
Definition: ciff_lin.h:777
double average_doclength
The average document length. We store this value explicitly in case the exporting application wants a...
Definition: ciff_lin.h:156
header()
Constructor.
Definition: ciff_lin.h:167
bool operator!=(postings_list_iterator &with)
Compare two iterators.
Definition: ciff_lin.h:484
Hand crafted methods to read protocol buffer (protobuf) encoded files.
ciff_lin & source
The ciff_lin being iterated over.
Definition: ciff_lin.h:546
slice collection_docid
The primary key as a <pointer,length> tuple.
Definition: ciff_lin.h:328
A postings list with a term, df, cf, and postings list of <d,tf> pairs.
Definition: ciff_lin.h:207
Method did not complete successfully.
Definition: ciff_lin.h:137
ciff_lin::postings_list_iterator end()
Return an iterator to the end of this object.
Definition: ciff_lin.h:530
doc_record & operator*()
Return the most recently constructed doc record.
Definition: ciff_lin.h:603
docrecords_iterator & operator++()
Move on to the next postings list by constructing and storing the current one. On error move to the e...
Definition: ciff_lin.h:578
void clear(void)
Clear the contents of this doc record.
Definition: ciff_lin.h:353
static error_code get_next_postings(postings_list &into, const uint8_t *&stream, const uint8_t *stream_end)
read a full postings list (including term, df, cf, etc.) from a protobuf encoded stream ...
Definition: ciff_lin.h:262
ciff_lin & source
The ciff_lin being iterated over.
Definition: ciff_lin.h:414
void clear(void)
Construct an empty slice with a pool allocator.
Definition: slice.h:223
ciff_lin::docrecords_iterator begin()
Return an iterator to the start of this object.
Definition: ciff_lin.h:648
Tuple for a posting in a traditional <d,tf> postings list.
ciff_lin::postings_list_iterator begin()
Return an iterator to the start of this object.
Definition: ciff_lin.h:518
postings_list postings
The postings list we just made.
Definition: ciff_lin.h:417
Little endian 64-bit number, integer or floating point types.
Definition: protobuf.h:45
const uint8_t * stream
Where in the source stream we are reading from.
Definition: ciff_lin.h:415
postings_foreach postings(void)
Return an object capable of being an iterator for postings lists. Assumes the "file pointer" is in th...
Definition: ciff_lin.h:764
postings_list_iterator & operator++()
Move on to the next postings list by constructing and storing the current one. On error move to the e...
Definition: ciff_lin.h:445
postings_list & operator*()
Return the most recently constructed postings list.
Definition: ciff_lin.h:459
static error_code get_next(doc_record &into, const uint8_t *&stream)
Get the next doc record from the stream.
Definition: ciff_lin.h:367
std::string description
Description of this index, meant for human consumption. Describing, for example, the exporting applic...
Definition: ciff_lin.h:157
void text_render(void)
Dump the value of the header to stdout.
Definition: ciff_lin.h:187
An object used to allow iteration over document records.
Definition: ciff_lin.h:629
int32_t num_postings_lists
Exactly the number of PostingsList messages that follow the Header.
Definition: ciff_lin.h:151
Reader for Jimmy Lin's shared index format.
Definition: ciff_lin.h:127
header & get_header(void)
Return the header object.
Definition: ciff_lin.h:790
int32_t version
Version number of the CIFF standard.
Definition: ciff_lin.h:150
compress_integer::integer docid
The Document identifier.
Definition: posting.h:31
error_code
success or failure.
Definition: ciff_lin.h:134
Definition: compress_integer_elias_delta_simd.c:23
iterator class for iterating over an index
Definition: ciff_lin.h:411
static uint8_t get_type_and_field(wire_type &type, const uint8_t *&stream)
Extract the field number and its type from the stream.
Definition: protobuf.h:259
ciff_lin(const uint8_t *source_file)
Constructor.
Definition: ciff_lin.h:747
docrecords_iterator(ciff_lin &of, size_t which)
Constructor.
Definition: ciff_lin.h:562
int32_t docid
The docid in the postings lists.
Definition: ciff_lin.h:327
size_t size(void) const
Return the length of this slice.
Definition: slice.h:256
int32_t doclength
The length of the document.
Definition: ciff_lin.h:329
doc_record current
The doc record list we just made.
Definition: ciff_lin.h:549
wire_type
The known protobuf types for protobuf 3.
Definition: protobuf.h:42
int32_t total_postings_lists
The total number of postings lists in the collection; the vocabulary size. This might differ from num...
Definition: ciff_lin.h:153
compress_integer::integer term_frequency
The number of times the term occurs in document with id docid.
Definition: posting.h:32
a document record object containing document lengths and primary keys
Definition: ciff_lin.h:324
doc_record()
Constructor.
Definition: ciff_lin.h:339
An object used to allow iteration over postings lists.
Definition: ciff_lin.h:499
Blob or string. length VARINT then data.
Definition: protobuf.h:46
int32_t total_docs
The total number of documents in the collection; might differ from num_doc_records for a similar reas...
Definition: ciff_lin.h:154