JASSv2
parser_fasta.h
Go to the documentation of this file.
1 /*
2  PARSER_FASTA.H
3  --------------
4  Copyright (c) 2019 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
13 #pragma once
14 
15 #include <stdint.h>
16 
17 #include "slice.h"
18 #include "parser.h"
19 #include "document.h"
20 
21 namespace JASS
22  {
23  /*
24  CLASS PARSER_FASTA
25  ------------------
26  */
34  class parser_fasta : public parser
35  {
36  private:
    /*
        The two states the tokenizer can be in.
        NOTE(review): the line that opens this enumeration (source line 40, presumably "enum parser_mode")
        is not present in this listing.
    */
41  {
42  TEXT,		// get_next_token() takes tokens from the base class parser::get_next_token()
43  DNA		// get_next_token() takes tokens from get_next_token_dna()
44  };
45 
46  private:
47  size_t kmer_length;		// Length of the k-mers; assigned once, by the constructor.
    // NOTE(review): the constructor also initialises "mode" and "end_of_fasta_document"; their
    // declarations (source lines 48-49) are not present in this listing.
50 
51  protected:
52  /*
53  PARSER_FASTA::GET_NEXT_TOKEN_DNA()
54  ----------------------------------
    Tokenizer used while the parser is in DNA mode; get_next_token() forwards to it.
    Only the declaration lives in this header, the body is defined elsewhere.
    Returns a const reference to a parser::token.
55  */
60  const class parser::token &get_next_token_dna(void);
61 
62  public:
63  /*
64  PARSER_FASTA::PARSER_FASTA()
65  ----------------------------
    Constructor.
    kmer_length: copied into the member of the same name.
    The object starts in TEXT mode with end_of_fasta_document set to nullptr; set_document()
    assigns both of those again.
    NOTE(review): the constructor is not marked explicit, so a size_t converts implicitly to a parser_fasta.
66  */
70  parser_fasta(size_t kmer_length) :
71  kmer_length(kmer_length),
72  mode(TEXT),
73  end_of_fasta_document(nullptr)
74  {
75  /*
76  We can only return alpha tokens or eof tokens so set the token type here to alpha
    NOTE(review): the statement that assigns the token type (source line 78) is not present in this listing.
77  */
    // Initialise the occurrence count of the token under construction to 1.
79  current_token.count = 1;
80  }
81 
82  /*
83  PARSER_FASTA::~PARSER_FASTA()
84  -----------------------------
    Virtual destructor. The body is empty: this class releases nothing itself.
85  */
89  virtual ~parser_fasta()
90  {
91  /*
92  Nothing
93  */
94  }
95 
96  /*
97  PARSER_FASTA::SET_DOCUMENT()
98  ----------------------------
    Point the parser at a new document and put it back into TEXT mode.
    document: the document to tokenize. Only its address is kept (in the_document), so it
    presumably has to stay alive for as long as tokens are requested -- confirm against callers.
    On return:
      current               = first byte of document.contents
      end_of_fasta_document = one past the last byte of document.contents
      end_of_document       = the first '\n' in document.contents, or end_of_fasta_document when
                              there is no '\n' (std::find returns its end argument on no match)
    NOTE(review): std::find is used below but this header shows no #include <algorithm>; it
    presumably arrives through one of the other includes -- confirm.
99  */
106  virtual void set_document(const class document &document)
107  {
108  the_document = &document;
109  current = (uint8_t *)document.contents.address();
110  end_of_fasta_document = (uint8_t *)document.contents.address() + document.contents.size();
111 
112  /*
113  Start in TEXT mode and "pretend" that the document stops at the start of the DNA (the end of the first line)
114  */
115  mode = TEXT;
    // Search from the start of the contents for the first newline; that position becomes the temporary end of document.
116  end_of_document = static_cast<uint8_t *>(std::find(static_cast<uint8_t *>(document.contents.address()), end_of_fasta_document, '\n'));
117  }
118 
119  /*
120  PARSER_FASTA::GET_NEXT_TOKEN()
121  ------------------------------
    Return the next token of the current document.
    In DNA mode the call is forwarded to get_next_token_dna().
    In TEXT mode the base class tokenizer is used; when it reports eof (the end of the first
    line, as arranged by set_document()) the parser switches to DNA mode, moves end_of_document
    out to end_of_fasta_document, and returns the first token from get_next_token_dna() instead
    of the eof token.
    Returns a const reference to a parser::token.
122  */
127  virtual const class parser::token &get_next_token(void)
128  {
129  if (mode == DNA)
130  return get_next_token_dna();
131  else
132  {
    // TEXT mode: let the base class parser produce the token.
133  const token &got = parser::get_next_token();
134  if (got.type == token::token_type::eof)
135  {
    // The text part is exhausted; this is a one-way switch until set_document() resets mode to TEXT.
136  mode = DNA;
137  end_of_document = end_of_fasta_document; // shift to the end of document being the end of the FASTA document not just the end of the text.
138  return get_next_token_dna();
139  }
140  return got;
141  }
142  }
143 
144  /*
145  PARSER_FASTA::UNITTEST()
146  ------------------------
    Unit test for this class. Static, so it is called without an instance.
    Only the declaration lives in this header, the body is defined elsewhere.
147  */
151  static void unittest(void);
152  };
153  }
const class parser::token & get_next_token_dna(void)
Continue parsing the input looking for the next DNA k-mer token.
Definition: parser_fasta.cpp:24
token current_token
The token that is currently being built. A reference to this is returned when the token is complete...
Definition: parser.h:127
virtual ~parser_fasta()
Destructor.
Definition: parser_fasta.h:89
Simple, but fast, XML parser.
Definition: parser.h:39
void * address(void) const
Extract the pointer value from the slice.
Definition: slice.h:269
const uint8_t * end_of_document
Pointer to the end of the document, used to avoid reading past the end of the buffer.
Definition: parser.h:126
Container class representing a document through the indexing pipeline.
Definition: document.h:31
parser_mode mode
The mode (TEXT or DNA) of the tokenizer.
Definition: parser_fasta.h:48
Parser to turn DNA sequences in FASTA format into k-mers for indexing.
Definition: parser_fasta.h:34
character is a DNA base (i.e. one of {ACTGactg})
Definition: ascii_database_to_c.cpp:35
Simple XML parser that doesn't handle either attributes or entities.
slice contents
The contents of the document (or likewise).
Definition: document.h:43
virtual const class parser::token & get_next_token(void)
Continue parsing the input looking for the next token.
Definition: parser_fasta.h:127
A document within the indexing pipeline.
virtual const class parser::token & get_next_token(void)
Continue parsing the input looking for the next token.
Definition: parser.cpp:79
uint8_t * end_of_fasta_document
Pointer to the end of the FASTA document, end_of_document points to the end of the first line (the pr...
Definition: parser_fasta.h:49
parser_fasta(size_t kmer_length)
Constructor.
Definition: parser_fasta.h:70
static void unittest(void)
Unit test this class.
Definition: parser_fasta.cpp:92
token_type type
The type of this token (See token_type)
Definition: parser.h:86
size_t kmer_length
The length of the k-mers to compute from the DNA sequences.
Definition: parser_fasta.h:47
const uint8_t * current
The current location within the document.
Definition: parser.h:125
const document * the_document
The document that is currently being parsed.
Definition: parser.h:124
A token as returned by the parser.
Definition: parser.h:58
parser_mode
Definition: parser_fasta.h:40
Slices (also known as string-descriptors) for C++.
virtual void set_document(const class document &document)
Start parsing from the start of this document.
Definition: parser_fasta.h:106
Definition: compress_integer_elias_delta_simd.c:23
alphabetic token
Definition: parser.h:67
size_t size(void) const
Return the length of this slice.
Definition: slice.h:256
index_postings_impact::impact_type count
The number of times the token is seen (normally 1, but if parsing a forward index it might be known t...
Definition: parser.h:87