15 #ifndef RAPIDJSON_INTERNAL_REGEX_H_ 16 #define RAPIDJSON_INTERNAL_REGEX_H_ 18 #include "../allocators.h" 19 #include "../stream.h" 24 RAPIDJSON_DIAG_OFF(padded)
25 RAPIDJSON_DIAG_OFF(switch-enum)
26 RAPIDJSON_DIAG_OFF(implicit-fallthrough)
31 RAPIDJSON_DIAG_OFF(effc++)
36 RAPIDJSON_DIAG_OFF(4512)
39 #ifndef RAPIDJSON_REGEX_VERBOSE 40 #define RAPIDJSON_REGEX_VERBOSE 0 49 template <
typename SourceStream,
typename Encoding>
52 DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); }
53 unsigned Peek() {
return codepoint_; }
55 unsigned c = codepoint_;
63 if (!Encoding::Decode(ss_, &codepoint_))
77 template <
typename Encoding,
typename Allocator>
112 template <
typename Encoding,
typename Allocator = CrtAllocator>
115 typedef Encoding EncodingType;
116 typedef typename Encoding::Ch Ch;
119 GenericRegex(
const Ch* source, Allocator* allocator = 0) :
120 states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
121 anchorBegin_(), anchorEnd_()
130 bool IsValid()
const {
131 return root_ != kRegexInvalidState;
144 static const unsigned kAnyCharacterClass = 0xFFFFFFFF;
145 static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
146 static const unsigned kRangeNegationFlag = 0x80000000;
170 return states_.template Bottom<State>()[index];
173 const State& GetState(
SizeType index)
const {
175 return states_.template Bottom<State>()[index];
180 return ranges_.template Bottom<Range>()[index];
183 const Range& GetRange(
SizeType index)
const {
185 return ranges_.template Bottom<Range>()[index];
188 template <
typename InputStream>
195 *atomCountStack.template Push<unsigned>() = 0;
198 while (ds.Peek() != 0) {
199 switch (codepoint = ds.Take()) {
209 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
210 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
212 *operatorStack.template Push<Operator>() = kAlternation;
213 *atomCountStack.template Top<unsigned>() = 0;
217 *operatorStack.template Push<Operator>() = kLeftParenthesis;
218 *atomCountStack.template Push<unsigned>() = 0;
222 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
223 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
225 if (operatorStack.Empty())
227 operatorStack.template Pop<Operator>(1);
228 atomCountStack.template Pop<unsigned>(1);
229 ImplicitConcatenation(atomCountStack, operatorStack);
233 if (!Eval(operandStack, kZeroOrOne))
238 if (!Eval(operandStack, kZeroOrMore))
243 if (!Eval(operandStack, kOneOrMore))
250 if (!ParseUnsigned(ds, &n))
253 if (ds.Peek() ==
',') {
255 if (ds.Peek() ==
'}')
256 m = kInfinityQuantifier;
257 else if (!ParseUnsigned(ds, &m) || m < n)
263 if (!EvalQuantifier(operandStack, n, m) || ds.Peek() !=
'}')
270 PushOperand(operandStack, kAnyCharacterClass);
271 ImplicitConcatenation(atomCountStack, operatorStack);
277 if (!ParseRange(ds, &range))
279 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
280 GetState(s).rangeStart = range;
281 *operandStack.template Push<Frag>() = Frag(s, s, s);
283 ImplicitConcatenation(atomCountStack, operatorStack);
287 if (!CharacterEscape(ds, &codepoint))
292 PushOperand(operandStack, codepoint);
293 ImplicitConcatenation(atomCountStack, operatorStack);
297 while (!operatorStack.Empty())
298 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
302 if (operandStack.GetSize() ==
sizeof(Frag)) {
303 Frag* e = operandStack.template Pop<Frag>(1);
304 Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
307 #if RAPIDJSON_REGEX_VERBOSE 308 printf(
"root: %d\n", root_);
309 for (
SizeType i = 0; i < stateCount_ ; i++) {
310 State& s = GetState(i);
311 printf(
"[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (
char)s.codepoint);
319 State* s = states_.template Push<State>();
322 s->codepoint = codepoint;
323 s->rangeStart = kRegexInvalidRange;
324 return stateCount_++;
328 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
329 *operandStack.template Push<Frag>() = Frag(s, s, s);
333 if (*atomCountStack.template Top<unsigned>())
334 *operatorStack.template Push<Operator>() = kConcatenation;
335 (*atomCountStack.template Top<unsigned>())++;
340 while (GetState(l1).out != kRegexInvalidState)
341 l1 = GetState(l1).out;
342 GetState(l1).out = l2;
347 for (
SizeType next; l != kRegexInvalidState; l = next) {
348 next = GetState(l).out;
358 Frag e2 = *operandStack.template Pop<Frag>(1);
359 Frag e1 = *operandStack.template Pop<Frag>(1);
360 Patch(e1.out, e2.start);
361 *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
366 if (operandStack.GetSize() >=
sizeof(Frag) * 2) {
367 Frag e2 = *operandStack.template Pop<Frag>(1);
368 Frag e1 = *operandStack.template Pop<Frag>(1);
369 SizeType s = NewState(e1.start, e2.start, 0);
370 *operandStack.template Push<Frag>() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
376 if (operandStack.GetSize() >=
sizeof(Frag)) {
377 Frag e = *operandStack.template Pop<Frag>(1);
378 SizeType s = NewState(kRegexInvalidState, e.start, 0);
379 *operandStack.template Push<Frag>() = Frag(s, Append(e.out, s), e.minIndex);
385 if (operandStack.GetSize() >=
sizeof(Frag)) {
386 Frag e = *operandStack.template Pop<Frag>(1);
387 SizeType s = NewState(kRegexInvalidState, e.start, 0);
389 *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
396 if (operandStack.GetSize() >=
sizeof(Frag)) {
397 Frag e = *operandStack.template Pop<Frag>(1);
398 SizeType s = NewState(kRegexInvalidState, e.start, 0);
400 *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
407 bool EvalQuantifier(
Stack<Allocator>& operandStack,
unsigned n,
unsigned m) {
414 else if (m == kInfinityQuantifier)
415 Eval(operandStack, kZeroOrMore);
417 Eval(operandStack, kZeroOrOne);
418 for (
unsigned i = 0; i < m - 1; i++)
419 CloneTopOperand(operandStack);
420 for (
unsigned i = 0; i < m - 1; i++)
421 Eval(operandStack, kConcatenation);
426 for (
unsigned i = 0; i < n - 1; i++)
427 CloneTopOperand(operandStack);
429 if (m == kInfinityQuantifier)
430 Eval(operandStack, kOneOrMore);
432 CloneTopOperand(operandStack);
433 Eval(operandStack, kZeroOrOne);
434 for (
unsigned i = n; i < m - 1; i++)
435 CloneTopOperand(operandStack);
436 for (
unsigned i = n; i < m; i++)
437 Eval(operandStack, kConcatenation);
440 for (
unsigned i = 0; i < n - 1; i++)
441 Eval(operandStack, kConcatenation);
449 const Frag src = *operandStack.template Top<Frag>();
450 SizeType count = stateCount_ - src.minIndex;
451 State* s = states_.template Push<State>(count);
452 memcpy(s, &GetState(src.minIndex), count *
sizeof(State));
453 for (
SizeType j = 0; j < count; j++) {
454 if (s[j].out != kRegexInvalidState)
456 if (s[j].out1 != kRegexInvalidState)
459 *operandStack.template Push<Frag>() = Frag(src.start + count, src.out + count, src.minIndex + count);
460 stateCount_ += count;
463 template <
typename InputStream>
466 if (ds.Peek() <
'0' || ds.Peek() >
'9')
468 while (ds.Peek() >=
'0' && ds.Peek() <=
'9') {
469 if (r >= 429496729 && ds.Peek() >
'5')
471 r = r * 10 + (ds.Take() -
'0');
477 template <
typename InputStream>
482 SizeType start = kRegexInvalidRange;
483 SizeType current = kRegexInvalidRange;
485 while ((codepoint = ds.Take()) != 0) {
488 if (codepoint ==
'^') {
496 if (start == kRegexInvalidRange)
501 GetRange(current).next = r;
504 GetRange(start).start |= kRangeNegationFlag;
509 if (ds.Peek() ==
'b') {
513 else if (!CharacterEscape(ds, &codepoint))
520 if (codepoint ==
'-') {
529 if (current != kRegexInvalidRange)
530 GetRange(current).next = r;
531 if (start == kRegexInvalidRange)
540 GetRange(current).end = codepoint;
548 SizeType NewRange(
unsigned codepoint) {
549 Range* r = ranges_.template Push<Range>();
550 r->start = r->end = codepoint;
551 r->next = kRegexInvalidRange;
552 return rangeCount_++;
555 template <
typename InputStream>
558 switch (codepoint = ds.Take()) {
573 *escapedCodepoint = codepoint;
return true;
574 case 'f': *escapedCodepoint = 0x000C;
return true;
575 case 'n': *escapedCodepoint = 0x000A;
return true;
576 case 'r': *escapedCodepoint = 0x000D;
return true;
577 case 't': *escapedCodepoint = 0x0009;
return true;
578 case 'v': *escapedCodepoint = 0x000B;
return true;
590 static const unsigned kInfinityQuantifier = ~0u;
597 template <
typename RegexType,
typename Allocator = CrtAllocator>
600 typedef typename RegexType::EncodingType Encoding;
601 typedef typename Encoding::Ch Ch;
604 regex_(regex), allocator_(allocator), ownAllocator_(0),
605 state0_(allocator, 0), state1_(allocator, 0), stateSet_()
610 stateSet_ =
static_cast<unsigned*
>(allocator_->Malloc(GetStateSetSize()));
611 state0_.template Reserve<SizeType>(regex_.stateCount_);
612 state1_.template Reserve<SizeType>(regex_.stateCount_);
616 Allocator::Free(stateSet_);
620 template <
typename InputStream>
621 bool Match(InputStream& is) {
622 return SearchWithAnchoring(is,
true,
true);
625 bool Match(
const Ch* s) {
630 template <
typename InputStream>
631 bool Search(InputStream& is) {
632 return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_);
635 bool Search(
const Ch* s) {
641 typedef typename RegexType::State State;
642 typedef typename RegexType::Range Range;
644 template <
typename InputStream>
645 bool SearchWithAnchoring(InputStream& is,
bool anchorBegin,
bool anchorEnd) {
650 const size_t stateSetSize = GetStateSetSize();
651 std::memset(stateSet_, 0, stateSetSize);
653 bool matched = AddState(*current, regex_.root_);
655 while (!current->Empty() && (codepoint = ds.Take()) != 0) {
656 std::memset(stateSet_, 0, stateSetSize);
659 for (
const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
660 const State& sr = regex_.GetState(*s);
661 if (sr.codepoint == codepoint ||
662 sr.codepoint == RegexType::kAnyCharacterClass ||
663 (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
665 matched = AddState(*next, sr.out) || matched;
666 if (!anchorEnd && matched)
670 AddState(*next, regex_.root_);
672 internal::Swap(current, next);
678 size_t GetStateSetSize()
const {
679 return (regex_.stateCount_ + 31) / 32 * 4;
686 const State& s = regex_.GetState(index);
687 if (s.out1 != kRegexInvalidState) {
688 bool matched = AddState(l, s.out);
689 return AddState(l, s.out1) || matched;
691 else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) {
692 stateSet_[index >> 5] |= (1 << (index & 31));
693 *l.template PushUnsafe<SizeType>() = index;
695 return s.out == kRegexInvalidState;
698 bool MatchRange(
SizeType rangeIndex,
unsigned codepoint)
const {
699 bool yes = (regex_.GetRange(rangeIndex).start & RegexType::kRangeNegationFlag) == 0;
700 while (rangeIndex != kRegexInvalidRange) {
701 const Range& r = regex_.GetRange(rangeIndex);
702 if (codepoint >= (r.start & ~RegexType::kRangeNegationFlag) && codepoint <= r.end)
709 const RegexType& regex_;
710 Allocator* allocator_;
711 Allocator* ownAllocator_;
731 #endif // RAPIDJSON_INTERNAL_REGEX_H_ RAPIDJSON_NAMESPACE_BEGIN typedef unsigned SizeType
Size type (for string lengths, array sizes, etc.)
Definition: rapidjson.h:380
#define RAPIDJSON_ASSERT(x)
Assertion.
Definition: rapidjson.h:402
Read-only string stream.
Definition: fwd.h:47
#define RAPIDJSON_NAMESPACE_END
provide custom rapidjson namespace (closing expression)
Definition: rapidjson.h:119
A type-unsafe stack for storing different types of data.
Definition: stack.h:36
Regular expression engine with subset of ECMAscript grammar.
Definition: regex.h:113
#define RAPIDJSON_NAMESPACE_BEGIN
provide custom rapidjson namespace (opening expression)
Definition: rapidjson.h:116
#define RAPIDJSON_NEW(x)
! customization point for global new
Definition: rapidjson.h:586
#define RAPIDJSON_DELETE(x)
! customization point for global delete
Definition: rapidjson.h:590
Definition: document.h:399