JASSv2
instream_document_trec.h
Go to the documentation of this file.
1 /*
2  INSTREAM_DOCUMENT_TREC.H
3  ------------------------
4  Copyright (c) 2016 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
14 #pragma once
15 
16 #include <stdint.h>
17 
18 #include <string>
19 
20 #include "slice.h"
21 #include "instream.h"
22 
23 namespace JASS
24  {
25  /*
26  CLASS INSTREAM_DOCUMENT_TREC
27  ----------------------------
28  */
36  {
37  protected:
38  size_t buffer_size;
39 
40  protected:
41  uint8_t *buffer;
42  uint8_t *buffer_end;
43  size_t buffer_used;
44 
45  std::string document_start_tag;
46  std::string document_end_tag;
47  std::string primary_key_start_tag;
48  std::string primary_key_end_tag;
49 
50  protected:
51  /*
52  INSTREAM_DOCUMENT_TREC::INSTREAM_DOCUMENT_TREC()
53  ------------------------------------------------
54  */
62  instream_document_trec(std::shared_ptr<instream> &source, size_t buffer_size, const std::string &document_tag, const std::string &document_primary_key_tag);
63 
64  /*
65  INSTREAM_DOCUMENT_TREC::SET_TAGS()
66  ----------------------------------
67  */
73  void set_tags(const std::string &document_tag, const std::string &primary_key_tag);
74 
75  /*
76  INSTREAM_DOCUMENT_TREC::FETCH()
77  -------------------------------
78  */
84  void fetch(void *buffer, size_t bytes)
85  {
 /*
  Pass the request straight through to source->fetch(buffer, bytes) and then set buffer_end to the
  caller-supplied buffer pointer advanced by whatever that call returned (presumably the number of
  bytes actually read, which may be fewer than requested - confirm against instream::fetch()).
  NOTE: the parameter named buffer hides the class member of the same name, so the pointer used
  here is the one passed in by the caller, not the member.
 */
86  buffer_end = (uint8_t *)buffer + source->fetch(buffer, bytes);
87  }
88 
89  public:
90  /*
91  INSTREAM_DOCUMENT_TREC::INSTREAM_DOCUMENT_TREC()
92  ------------------------------------------------
93  */
98  instream_document_trec(const instream_document_trec &previous) = delete;
99 
100  /*
101  INSTREAM_DOCUMENT_TREC::INSTREAM_DOCUMENT_TREC()
102  ------------------------------------------------
103  */
110  instream_document_trec(std::shared_ptr<instream> &source, const std::string &document_tag = "DOC", const std::string &document_primary_key_tag = "DOCNO");
111 
112  /*
113  INSTREAM_DOCUMENT_TREC::~INSTREAM_DOCUMENT_TREC()
114  -------------------------------------------------
115  */
119  virtual ~instream_document_trec();
120 
121  /*
122  INSTREAM_DOCUMENT_TREC::READ()
123  ------------------------------
124  */
129  virtual void read(document &buffer);
130 
131  /*
132  INSTREAM_DOCUMENT_TREC::UNITTEST()
133  ----------------------------------
134  */
138  static void unittest(void);
139  } ;
140 }
void set_tags(const std::string &document_tag, const std::string &primary_key_tag)
Register the document tag and the primary key tag. Used to set up internal data structures.
Definition: instream_document_trec.cpp:68
std::string primary_key_start_tag
The primary key's start tag ("<DOCNO>" by default)
Definition: instream_document_trec.h:47
std::shared_ptr< instream > source
If this object is reading from another instream then this is that instream.
Definition: instream.h:48
Read data from an input stream.
Definition: instream.h:45
virtual ~instream_document_trec()
Destructor.
Definition: instream_document_trec.cpp:59
Container class representing a document through the indexing pipeline.
Definition: document.h:31
uint8_t * buffer
Pointer to the internal buffer from which documents are extracted. Filled by calling source...
Definition: instream_document_trec.h:41
size_t buffer_used
The number of bytes of buffer that have already been used from buffer (buffer + buffer_used is a poin...
Definition: instream_document_trec.h:43
Base class for reading data from some input source.
uint8_t * buffer_end
Pointer to the end of the buffer (used to prevent read past EOF).
Definition: instream_document_trec.h:42
static void unittest(void)
Unit test this class.
Definition: instream_document_trec.cpp:174
void fetch(void *buffer, size_t bytes)
Fetch another block of data from the source.
Definition: instream_document_trec.h:84
std::string document_end_tag
The end tag used to mark the end of a document ("</DOC>" by default)
Definition: instream_document_trec.h:46
Child class of instream for creating documents from TREC pre-web (i.e. news articles) data...
Definition: instream_document_trec.h:35
Slices (also known as string-descriptors) for C++.
Definition: compress_integer_elias_delta_simd.c:23
instream_document_trec(std::shared_ptr< instream > &source, size_t buffer_size, const std::string &document_tag, const std::string &document_primary_key_tag)
Protected constructor used to set the size of the internal buffer in the unittest.
Definition: instream_document_trec.cpp:25
size_t buffer_size
Size of the disk read buffer. Normally 16MB.
Definition: instream_document_trec.h:38
std::string document_start_tag
The start tag used to delineate documents ("<DOC>" by default)
Definition: instream_document_trec.h:45
virtual void read(document &buffer)
Read the next document from the source instream into document.
Definition: instream_document_trec.cpp:83
std::string primary_key_end_tag
The primary key's end tag ("</DOCNO>" by default)
Definition: instream_document_trec.h:48