Merge branch 'feature/re2'

This commit is contained in:
Matt Hill
2015-07-29 00:17:14 -04:00
79 changed files with 28450 additions and 33251 deletions

View File

@@ -29,6 +29,7 @@ namespace alpr
{ {
RegexRule::RegexRule(string region, string pattern) RegexRule::RegexRule(string region, string pattern)
//: re2_regex("")
{ {
this->original = pattern; this->original = pattern;
this->region = region; this->region = region;
@@ -79,11 +80,11 @@ namespace alpr
} }
else if (utf_character == "@") else if (utf_character == "@")
{ {
regexval << "\\" << "p" << "{Alpha}"; regexval << "\\pL";
} }
else if (utf_character == "#") else if (utf_character == "#")
{ {
regexval << "\\" << "p" << "{Digit}"; regexval << "\\pN";
} }
else if ((utf_character == "*") || (utf_character == "+")) else if ((utf_character == "*") || (utf_character == "+"))
{ {
@@ -98,20 +99,12 @@ namespace alpr
} }
this->regex = regexval.str(); this->regex = regexval.str();
// Onigurama is not thread safe when compiling regex. Using a mutex to ensure that
// we don't crash
regexrule_mutex_m.lock();
UChar* cstr_pattern = (UChar* )this->regex.c_str();
OnigErrorInfo einfo;
int r = onig_new(&onig_regex, cstr_pattern, cstr_pattern + strlen((char* )cstr_pattern), re2_regex = new re2::RE2(this->regex);
ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, ONIG_SYNTAX_DEFAULT, &einfo);
regexrule_mutex_m.unlock();
if (r != ONIG_NORMAL) {
//char s[ONIG_MAX_ERROR_MESSAGE_LEN];
//onig_error_code_to_str(s, r, &einfo); if (!re2_regex->ok()) {
cerr << "Unable to load regex: " << pattern << endl; cerr << "Unable to load regex: " << pattern << endl;
} }
else else
@@ -123,8 +116,7 @@ namespace alpr
RegexRule::~RegexRule() RegexRule::~RegexRule()
{ {
onig_free(onig_regex); delete re2_regex;
onig_end();
} }
bool RegexRule::match(string text) bool RegexRule::match(string text)
@@ -143,17 +135,9 @@ namespace alpr
if (text_char_length != numchars) if (text_char_length != numchars)
return false; return false;
OnigRegion *region = onig_region_new(); bool match = re2::RE2::FullMatch(text, *re2_regex);
unsigned char *start, *end;
UChar* cstr_text = (UChar* )text.c_str(); return match;
end = cstr_text + strlen((char* )cstr_text);
start = cstr_text;
int match = onig_match(onig_regex, cstr_text, end, start, region, ONIG_OPTION_NONE);
onig_region_free(region, 1);
return match == text.length();
} }
string RegexRule::filterSkips(string text) string RegexRule::filterSkips(string text)

View File

@@ -24,7 +24,7 @@
#include <string> #include <string>
#include <cstring> #include <cstring>
#include <vector> #include <vector>
#include "support/regex/oniguruma.h" #include "support/re2.h"
#include "support/utf8.h" #include "support/utf8.h"
#include "support/tinythread.h" #include "support/tinythread.h"
@@ -43,7 +43,7 @@ namespace alpr
bool valid; bool valid;
int numchars; int numchars;
regex_t* onig_regex; re2::RE2* re2_regex;
std::string original; std::string original;
std::string regex; std::string regex;
std::string region; std::string region;

View File

@@ -10,25 +10,39 @@ set(support_source_files
) )
set(regex_source_files set(regex_source_files
regex/regsyntax.c
regex/regposerr.c
regex/regcomp.c re2/bitstate.cc
regex/reggnu.c re2/compile.cc
regex/regerror.c re2/dfa.cc
regex/regext.c re2/filtered_re2.cc
regex/regversion.c re2/mimics_pcre.cc
regex/regparse.c re2/nfa.cc
regex/regenc.c re2/onepass.cc
regex/st.c re2/parse.cc
regex/regposix.c re2/perl_groups.cc
regex/regexec.c re2/prefilter.cc
regex/regtrav.c re2/prefilter_tree.cc
regex/ascii.c re2/prog.cc
regex/unicode.c re2/re2.cc
regex/utf8.c re2/regexp.cc
re2/set.cc
re2/simplify.cc
re2/stringpiece.cc
re2/tostring.cc
re2/unicode_casefold.cc
re2/unicode_groups.cc
re2/util/hash.cc
re2/util/stringprintf.cc
re2/util/rune.cc
re2/util/strutil.cc
re2/util/valgrind.cc
) )
include_directories(.)
add_library(support STATIC add_library(support STATIC
${support_source_files} ${support_source_files}
${regex_source_files} ${regex_source_files}

883
src/openalpr/support/re2.h Normal file
View File

@@ -0,0 +1,883 @@
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_RE2_H
#define RE2_RE2_H
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
// \d, \w, \s, ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the re2 library and hence supports
// its syntax for regular expressions, which is similar to Perl's with
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
// See https://github.com/google/re2/wiki/Syntax for the syntax
// supported by RE2, and a comparison with PCRE and PERL regexps.
//
// For those not familiar with Perl's regular expressions,
// here are some examples of the most commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(RE2::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!RE2::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, the pattern and input text are interpreted as UTF-8.
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
//
// Example:
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUB-STRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched subpieces.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// string s;
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns:
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
// This may get a little faster in the future, but right now is slower
// than PCRE. On the other hand, failed matches run *very* fast (faster
// than PCRE), as do matches without substring extraction.
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(RE2::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PRE-COMPILED REGULAR EXPRESSIONS
//
// RE2 makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "RE2"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// RE2 pattern("h.*o");
// while (ReadLine(&str)) {
// if (RE2::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// string var;
// int value;
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// RE2::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// USING VARIABLE NUMBER OF ARGUMENTS
//
// The above operations require you to know the number of arguments
// when you write the code. This is not always possible or easy (for
// example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments are determined at run time.
//
// Example:
// const RE2::Arg* args[10];
// int n;
// // ... populate args with pointers to RE2::Arg values ...
// // ... set n to the number of RE2::Arg objects ...
// bool match = RE2::FullMatchN(input, pattern, args, n);
//
// The last statement is equivalent to
//
// bool match = RE2::FullMatch(input, pattern,
// *args[0], *args[1], ..., *args[n - 1]);
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
// will leave 64 in a, b, c, and d.
#include <stdint.h>
#include <map>
#include <string>
#include "re2/stringpiece.h"
#include "re2/variadic_function.h"
#ifndef RE2_HAVE_LONGLONG
#define RE2_HAVE_LONGLONG 1
#endif
namespace re2 {
using std::string;
using std::map;
class Mutex;
class Prog;
class Regexp;
// The following enum should be used only as a constructor argument to indicate
// that the variable has static storage class, and that the constructor should
// do nothing to its state. It indicates to the reader that it is legal to
// declare a static instance of the class, provided the constructor is given
// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
// static variable that has a constructor or a destructor because invocation
// order is undefined. However, IF the type can be initialized by filling with
// zeroes (which the loader does for static variables), AND the type's
// destructor does nothing to the storage, then a constructor for static
// initialization can be declared as
// explicit MyClass(LinkerInitialized x) {}
// and invoked as
// static MyClass my_variable_name(LINKER_INITIALIZED);
enum LinkerInitialized { LINKER_INITIALIZED };
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
// concurrent use by multiple threads.
class RE2 {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
class Options;
// Defined in set.h.
class Set;
enum ErrorCode {
NoError = 0,
// Unexpected error
ErrorInternal,
// Parse errors
ErrorBadEscape, // bad escape sequence
ErrorBadCharClass, // bad character class
ErrorBadCharRange, // bad character class range
ErrorMissingBracket, // missing closing ]
ErrorMissingParen, // missing closing )
ErrorTrailingBackslash, // trailing \ at end of regexp
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
ErrorRepeatSize, // bad repetition argument
ErrorRepeatOp, // bad repetition operator
ErrorBadPerlOp, // bad perl operator
ErrorBadUTF8, // invalid UTF-8 in regexp
ErrorBadNamedCapture, // bad named capture group
ErrorPatternTooLarge // pattern too large (compile failed)
};
// Predefined common options.
// If you need more complicated things, instantiate
// an Option class, possibly passing one of these to
// the Option constructor, change the settings, and pass that
// Option class to the RE2 constructor.
enum CannedOptions {
DefaultOptions = 0,
Latin1, // treat input as Latin-1 (default UTF-8)
POSIX, // POSIX syntax, leftmost-longest match
Quiet // do not log about regexp parse errors
};
// Need to have the const char* and const string& forms for implicit
// conversions when passing string literals to FullMatch and PartialMatch.
// Otherwise the StringPiece form would be sufficient.
#ifndef SWIG
RE2(const char* pattern);
RE2(const string& pattern);
#endif
RE2(const StringPiece& pattern);
RE2(const StringPiece& pattern, const Options& option);
~RE2();
// Returns whether RE2 was created properly.
bool ok() const { return error_code() == NoError; }
// The string specification for this RE2. E.g.
// RE2 re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const string& pattern() const { return pattern_; }
// If RE2 could not be created properly, returns an error string.
// Else returns the empty string.
const string& error() const { return *error_; }
// If RE2 could not be created properly, returns an error code.
// Else returns RE2::NoError (== 0).
ErrorCode error_code() const { return error_code_; }
// If RE2 could not be created properly, returns the offending
// portion of the regexp.
const string& error_arg() const { return error_arg_; }
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout as a histogram bucketed by powers of 2.
// Returns the number of the largest non-empty bucket.
int ProgramFanout(map<int, int>* histogram) const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
re2::Regexp* Regexp() const { return entire_regexp_; }
/***** The useful part: the matching interface *****/
// Matches "text" against "pattern". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, int)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// a substring of "text".
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(RE2::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace(), except replaces successive non-overlapping occurrences
// of the pattern in the string with the rewrite. E.g.
//
// string s = "yabba dabba doo";
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
// Replacements are not subject to re-matching.
//
// Because GlobalReplace only replaces non-overlapping matches,
// replacing "ana" within "banana" makes only one replacement, not two.
//
// Returns the number of replacements made.
static int GlobalReplace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
//
// REQUIRES: "text" must not alias any part of "*out".
static bool Extract(const StringPiece &text,
const RE2& pattern,
const StringPiece &rewrite,
string *out);
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
// Generic matching interface
// Type of match.
enum Anchor {
UNANCHORED, // No anchoring
ANCHOR_START, // Anchor at start only
ANCHOR_BOTH // Anchor at start and end
};
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction. The overall match ($0)
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const;
// Return a map from names to capturing indices.
// The map records the index of the leftmost group
// with the given name.
// Only valid until the re is deleted.
const map<string, int>& NamedCapturingGroups() const;
// Return a map from capturing indices to names.
// The map has no entries for unnamed groups.
// Only valid until the re is deleted.
const map<int, string>& CapturingGroupNames() const;
// General matching routine.
// Match against text starting at offset startpos
// and stopping the search at offset endpos.
// Returns true if match found, false if not.
// On a successful match, fills in match[] (up to nmatch entries)
// with information about submatches.
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
//
// Don't ask for more match information than you will use:
// runs much faster with nmatch == 1 than nmatch > 1, and
// runs even faster if nmatch == 0.
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
// but will be handled correctly.
//
// Passing text == StringPiece(NULL, 0) will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, match[i] == NULL.
bool Match(const StringPiece& text,
int startpos,
int endpos,
Anchor anchor,
StringPiece *match,
int nmatch) const;
// Check that the given rewrite string is suitable for use with this
// regular expression. It checks that:
// * The regular expression has enough parenthesized subexpressions
// to satisfy all of the \N tokens in rewrite
// * The rewrite string doesn't have any syntax errors. E.g.,
// '\' followed by anything other than a digit or '\'.
// A true return value guarantees that Replace() and Extract() won't
// fail because of a bad rewrite string.
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
// Returns the maximum submatch needed for the rewrite to be done by
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
static int MaxSubmatch(const StringPiece& rewrite);
// Append the "rewrite" string, with backslash subsitutions from "vec",
// to string "out".
// Returns true on success. This method can fail because of a malformed
// rewrite string. CheckRewriteString guarantees that the rewrite will
// be sucessful.
bool Rewrite(string *out,
const StringPiece &rewrite,
const StringPiece* vec,
int veclen) const;
// Constructor options
class Options {
public:
// The options are (defaults in parentheses):
//
// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
// posix_syntax (false) restrict regexps to POSIX egrep syntax
// longest_match (false) search for longest match, not first match
// log_errors (true) log syntax and execution errors to ERROR
// max_mem (see below) approx. max memory footprint of RE2
// literal (false) interpret string as literal, not regexp
// never_nl (false) never match \n, even if it is in regexp
// dot_nl (false) dot matches everything including new line
// never_capture (false) parse all parens as non-capturing
// case_sensitive (true) match is case-sensitive (regexp can override
// with (?i) unless in posix_syntax mode)
//
// The following options are only consulted when posix_syntax == true.
// (When posix_syntax == false these features are always enabled and
// cannot be turned off.)
// perl_classes (false) allow Perl's \d \s \w \D \S \W
// word_boundary (false) allow Perl's \b \B (word boundary and not)
// one_line (false) ^ and $ only match beginning and end of text
//
// The max_mem option controls how much memory can be used
// to hold the compiled form of the regexp (the Prog) and
// its cached DFA graphs. Code Search placed limits on the number
// of Prog instructions and DFA states: 10,000 for both.
// In RE2, those limits would translate to about 240 KB per Prog
// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
// better job of keeping them small than Code Search did).
// Each RE2 has two Progs (one forward, one reverse), and each Prog
// can have two DFAs (one first match, one longest match).
// That makes 4 DFAs:
//
// forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
// if opt.longest_match() == false
// forward, longest-match - used for all ANCHOR_BOTH searches,
// and the other two kinds if
// opt.longest_match() == true
// reverse, first-match - never used
// reverse, longest-match - used as second phase for unanchored searches
//
// The RE2 memory budget is statically divided between the two
// Progs and then the DFAs: two thirds to the forward Prog
// and one third to the reverse Prog. The forward Prog gives half
// of what it has left over to each of its DFAs. The reverse Prog
// gives it all to its longest-match DFA.
//
// Once a DFA fills its budget, it flushes its cache and starts over.
// If this happens too often, RE2 falls back on the NFA implementation.
// For now, make the default budget something close to Code Search.
static const int kDefaultMaxMem = 8<<20;
enum Encoding {
EncodingUTF8 = 1,
EncodingLatin1
};
Options() :
encoding_(EncodingUTF8),
posix_syntax_(false),
longest_match_(false),
log_errors_(true),
max_mem_(kDefaultMaxMem),
literal_(false),
never_nl_(false),
dot_nl_(false),
never_capture_(false),
case_sensitive_(true),
perl_classes_(false),
word_boundary_(false),
one_line_(false) {
}
/*implicit*/ Options(CannedOptions);
Encoding encoding() const { return encoding_; }
void set_encoding(Encoding encoding) { encoding_ = encoding; }
// Legacy interface to encoding.
// TODO(rsc): Remove once clients have been converted.
bool utf8() const { return encoding_ == EncodingUTF8; }
void set_utf8(bool b) {
if (b) {
encoding_ = EncodingUTF8;
} else {
encoding_ = EncodingLatin1;
}
}
bool posix_syntax() const { return posix_syntax_; }
void set_posix_syntax(bool b) { posix_syntax_ = b; }
bool longest_match() const { return longest_match_; }
void set_longest_match(bool b) { longest_match_ = b; }
bool log_errors() const { return log_errors_; }
void set_log_errors(bool b) { log_errors_ = b; }
int64_t max_mem() const { return max_mem_; }
void set_max_mem(int64_t m) { max_mem_ = m; }
bool literal() const { return literal_; }
void set_literal(bool b) { literal_ = b; }
bool never_nl() const { return never_nl_; }
void set_never_nl(bool b) { never_nl_ = b; }
bool dot_nl() const { return dot_nl_; }
void set_dot_nl(bool b) { dot_nl_ = b; }
bool never_capture() const { return never_capture_; }
void set_never_capture(bool b) { never_capture_ = b; }
bool case_sensitive() const { return case_sensitive_; }
void set_case_sensitive(bool b) { case_sensitive_ = b; }
bool perl_classes() const { return perl_classes_; }
void set_perl_classes(bool b) { perl_classes_ = b; }
bool word_boundary() const { return word_boundary_; }
void set_word_boundary(bool b) { word_boundary_ = b; }
bool one_line() const { return one_line_; }
void set_one_line(bool b) { one_line_ = b; }
void Copy(const Options& src) {
encoding_ = src.encoding_;
posix_syntax_ = src.posix_syntax_;
longest_match_ = src.longest_match_;
log_errors_ = src.log_errors_;
max_mem_ = src.max_mem_;
literal_ = src.literal_;
never_nl_ = src.never_nl_;
dot_nl_ = src.dot_nl_;
never_capture_ = src.never_capture_;
case_sensitive_ = src.case_sensitive_;
perl_classes_ = src.perl_classes_;
word_boundary_ = src.word_boundary_;
one_line_ = src.one_line_;
}
int ParseFlags() const;
private:
Encoding encoding_;
bool posix_syntax_;
bool longest_match_;
bool log_errors_;
int64_t max_mem_;
bool literal_;
bool never_nl_;
bool dot_nl_;
bool never_capture_;
bool case_sensitive_;
bool perl_classes_;
bool word_boundary_;
bool one_line_;
//DISALLOW_COPY_AND_ASSIGN(Options);
Options(const Options&);
void operator=(const Options&);
};
// Returns the options set in the constructor.
const Options& options() const { return options_; };
// Argument converters; see below.
static inline Arg CRadix(short* x);
static inline Arg CRadix(unsigned short* x);
static inline Arg CRadix(int* x);
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
#if RE2_HAVE_LONGLONG
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
#endif
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
static inline Arg Hex(int* x);
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
#if RE2_HAVE_LONGLONG
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
#endif
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
static inline Arg Octal(int* x);
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
#if RE2_HAVE_LONGLONG
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
#endif
private:
void Init(const StringPiece& pattern, const Options& options);
bool DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
const Arg* const args[],
int n) const;
re2::Prog* ReverseProg() const;
mutable Mutex* mutex_;
string pattern_; // string regular expression
Options options_; // option flags
string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
mutable re2::Prog* rprog_; // reverse program for regexp
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable const string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable string error_arg_; // Fragment of regexp showing error
mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
// Map from capture indices to names
mutable const map<int, string>* group_names_;
//DISALLOW_COPY_AND_ASSIGN(RE2);
RE2(const RE2&);
void operator=(const RE2&);
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Special class for parsing into objects that define a ParseFrom() method
template <class T>
class _RE2_MatchObject {
public:
static inline bool Parse(const char* str, int n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
}
};
class RE2::Arg {
public:
// Empty constructor so we can declare arrays of RE2::Arg
Arg();
// Constructor specially designed for NULL arguments
Arg(void*);
typedef bool (*Parser)(const char* str, int n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type,name) \
Arg(type* p) : arg_(p), parser_(name) { } \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
MAKE_PARSER(char, parse_char);
MAKE_PARSER(signed char, parse_char);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
MAKE_PARSER(unsigned int, parse_uint);
MAKE_PARSER(long, parse_long);
MAKE_PARSER(unsigned long, parse_ulong);
#if RE2_HAVE_LONGLONG
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
#endif
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
#undef MAKE_PARSER
// Generic constructor templates
template <class T> Arg(T* p)
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
template <class T> Arg(T* p, Parser parser)
: arg_(p), parser_(parser) { }
// Parse the data
bool Parse(const char* str, int n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, int n, void* dest);
static bool parse_char (const char* str, int n, void* dest);
static bool parse_uchar (const char* str, int n, void* dest);
static bool parse_float (const char* str, int n, void* dest);
static bool parse_double (const char* str, int n, void* dest);
static bool parse_string (const char* str, int n, void* dest);
static bool parse_stringpiece (const char* str, int n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_ ## name(const char* str, int n, void* dest); \
static bool parse_ ## name ## _radix( \
const char* str, int n, void* dest, int radix); \
public: \
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
DECLARE_INTEGER_PARSER(int);
DECLARE_INTEGER_PARSER(uint);
DECLARE_INTEGER_PARSER(long);
DECLARE_INTEGER_PARSER(ulong);
#if RE2_HAVE_LONGLONG
DECLARE_INTEGER_PARSER(longlong);
DECLARE_INTEGER_PARSER(ulonglong);
#endif
#undef DECLARE_INTEGER_PARSER
};
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
inline bool RE2::Arg::Parse(const char* str, int n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline RE2::Arg RE2::Hex(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
inline RE2::Arg RE2::Octal(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
inline RE2::Arg RE2::CRadix(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
#if RE2_HAVE_LONGLONG
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
#endif
#undef MAKE_INTEGER_PARSER
} // namespace re2
using re2::RE2;
#endif /* RE2_RE2_H */

View File

@@ -0,0 +1,380 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Like
// testing/backtrack.cc, it allocates a bit vector with (length of
// text) * (length of prog) bits, to make sure it never explores the
// same (character position, instruction) state multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
struct Job {
int id;
int arg;
const char* p;
};
class BitState {
public:
explicit BitState(Prog* prog);
~BitState();
// The usual Search prototype.
// Can only call Search once per BitState.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
inline bool ShouldVisit(int id, const char* p);
void Push(int id, const char* p, int arg);
bool GrowStack();
bool TrySearch(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether match must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char** cap_; // capture registers
int ncap_;
static const int VisitedBits = 32;
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
int nvisited_; // # of words in bitmap
Job *job_; // stack of text positions to explore
int njob_;
int maxjob_;
};
BitState::BitState(Prog* prog)
: prog_(prog),
anchored_(false),
longest_(false),
endmatch_(false),
submatch_(NULL),
nsubmatch_(0),
cap_(NULL),
ncap_(0),
visited_(NULL),
nvisited_(0),
job_(NULL),
njob_(0),
maxjob_(0) {
}
BitState::~BitState() {
delete[] visited_;
delete[] job_;
delete[] cap_;
}
// Should the search visit the pair ip, p?
// If so, remember that it was visited so that the next time,
// we don't repeat the visit.
bool BitState::ShouldVisit(int id, const char* p) {
uint n = id * (text_.size() + 1) + (p - text_.begin());
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
return false;
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
return true;
}
// Grow the stack.
bool BitState::GrowStack() {
// VLOG(0) << "Reallocate.";
maxjob_ *= 2;
Job* newjob = new Job[maxjob_];
memmove(newjob, job_, njob_*sizeof job_[0]);
delete[] job_;
job_ = newjob;
if (njob_ >= maxjob_) {
LOG(DFATAL) << "Job stack overflow.";
return false;
}
return true;
}
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p, int arg) {
if (njob_ >= maxjob_) {
if (!GrowStack())
return;
}
int op = prog_->inst(id)->opcode();
if (op == kInstFail)
return;
// Only check ShouldVisit when arg == 0.
// When arg > 0, we are continuing a previous visit.
if (arg == 0 && !ShouldVisit(id, p))
return;
Job* j = &job_[njob_++];
j->id = id;
j->p = p;
j->arg = arg;
}
// Try a search from instruction id0 in state p0.
// Return whether it succeeded.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
const char* end = text_.end();
njob_ = 0;
Push(id0, p0, 0);
while (njob_ > 0) {
// Pop job off stack.
--njob_;
int id = job_[njob_].id;
const char* p = job_[njob_].p;
int arg = job_[njob_].arg;
// Optimization: rather than push and pop,
// code that is going to Push and continue
// the loop simply updates ip, p, and arg
// and jumps to CheckAndLoop. We have to
// do the ShouldVisit check that Push
// would have, but we avoid the stack
// manipulation.
if (0) {
CheckAndLoop:
if (!ShouldVisit(id, p))
continue;
}
// Visit ip, p.
// VLOG(0) << "Job: " << ip->id() << " "
// << (p - text_.begin()) << " " << arg;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
case kInstFail:
return false;
default:
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
return false;
case kInstAlt:
// Cannot just
// Push(ip->out1(), p, 0);
// Push(ip->out(), p, 0);
// If, during the processing of ip->out(), we encounter
// ip->out1() via another path, we want to process it then.
// Pushing it here will inhibit that. Instead, re-push
// ip with arg==1 as a reminder to push ip->out1() later.
switch (arg) {
case 0:
Push(id, p, 1); // come back when we're done
id = ip->out();
goto CheckAndLoop;
case 1:
// Finished ip->out(); try ip->out1().
arg = 0;
id = ip->out1();
goto CheckAndLoop;
}
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
continue;
case kInstAltMatch:
// One opcode is byte range; the other leads to match.
if (ip->greedy(prog_)) {
// out1 is the match
Push(ip->out1(), p, 0);
id = ip->out1();
p = end;
goto CheckAndLoop;
}
// out is the match - non-greedy
Push(ip->out(), end, 0);
id = ip->out();
goto CheckAndLoop;
case kInstByteRange: {
int c = -1;
if (p < end)
c = *p & 0xFF;
if (ip->Matches(c)) {
id = ip->out();
p++;
goto CheckAndLoop;
}
continue;
}
case kInstCapture:
switch (arg) {
case 0:
if (0 <= ip->cap() && ip->cap() < ncap_) {
// Capture p to register, but save old value.
Push(id, cap_[ip->cap()], 1); // come back when we're done
cap_[ip->cap()] = p;
}
// Continue on.
id = ip->out();
goto CheckAndLoop;
case 1:
// Finished ip->out(); restore the old value.
cap_[ip->cap()] = p;
continue;
}
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
continue;
case kInstEmptyWidth:
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
continue;
id = ip->out();
goto CheckAndLoop;
case kInstNop:
id = ip->out();
goto CheckAndLoop;
case kInstMatch: {
if (endmatch_ && p != text_.end())
continue;
// VLOG(0) << "Found match.";
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if (nsubmatch_ == 0)
return true;
// Record best match so far.
// Only need to check end point, because this entire
// call is only considering one start position.
matched = true;
cap_[1] = p;
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].end())) {
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
}
// If going for first match, we're done.
if (!longest_)
return true;
// If we used the entire text, no longer match is possible.
if (p == text_.end())
return true;
// Otherwise, continue on in hope of a longer match.
continue;
}
}
}
return matched;
}
// Search text (within context) for prog_.
bool BitState::Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
// Search parameters.
text_ = text;
context_ = context;
if (context_.begin() == NULL)
context_ = text;
if (prog_->anchor_start() && context_.begin() != text.begin())
return false;
if (prog_->anchor_end() && context_.end() != text.end())
return false;
anchored_ = anchored || prog_->anchor_start();
longest_ = longest || prog_->anchor_end();
endmatch_ = prog_->anchor_end();
submatch_ = submatch;
nsubmatch_ = nsubmatch;
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = NULL;
// Allocate scratch space.
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
visited_ = new uint32[nvisited_];
memset(visited_, 0, nvisited_*sizeof visited_[0]);
// VLOG(0) << "nvisited_ = " << nvisited_;
ncap_ = 2*nsubmatch;
if (ncap_ < 2)
ncap_ = 2;
cap_ = new const char*[ncap_];
memset(cap_, 0, ncap_*sizeof cap_[0]);
maxjob_ = 256;
job_ = new Job[maxjob_];
// Anchored search must start at text.begin().
if (anchored_) {
cap_[0] = text.begin();
return TrySearch(prog_->start(), text.begin());
}
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is p <= text.end(), not p < text.end().
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited_ between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
for (const char* p = text.begin(); p <= text.end(); p++) {
cap_[0] = p;
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
return true;
}
return false;
}
// Bit-state search.
bool Prog::SearchBitState(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
BitState b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,107 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <string>
#include "util/util.h"
#include "re2/filtered_re2.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
namespace re2 {
FilteredRE2::FilteredRE2()
: compiled_(false),
prefilter_tree_(new PrefilterTree()) {
}
FilteredRE2::~FilteredRE2() {
for (size_t i = 0; i < re2_vec_.size(); i++)
delete re2_vec_[i];
delete prefilter_tree_;
}
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
const RE2::Options& options, int* id) {
RE2* re = new RE2(pattern, options);
RE2::ErrorCode code = re->error_code();
if (!re->ok()) {
if (options.log_errors()) {
LOG(ERROR) << "Couldn't compile regular expression, skipping: "
<< re << " due to error " << re->error();
}
delete re;
} else {
*id = re2_vec_.size();
re2_vec_.push_back(re);
}
return code;
}
void FilteredRE2::Compile(vector<string>* atoms) {
if (compiled_ || re2_vec_.size() == 0) {
LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
return;
}
for (size_t i = 0; i < re2_vec_.size(); i++) {
Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
prefilter_tree_->Add(prefilter);
}
atoms->clear();
prefilter_tree_->Compile(atoms);
compiled_ = true;
}
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (size_t i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return i;
return -1;
}
int FilteredRE2::FirstMatch(const StringPiece& text,
const vector<int>& atoms) const {
if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile";
return -1;
}
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
return regexps[i];
return -1;
}
bool FilteredRE2::AllMatches(
const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const {
matching_regexps->clear();
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
matching_regexps->push_back(regexps[i]);
return !matching_regexps->empty();
}
void FilteredRE2::AllPotentials(
const vector<int>& atoms,
vector<int>* potential_regexps) const {
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
}
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps) {
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
void FilteredRE2::PrintPrefilter(int regexpid) {
prefilter_tree_->PrintPrefilter(regexpid);
}
} // namespace re2

View File

@@ -0,0 +1,109 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string match
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. The compile returns strings that need to
// be matched. Note that all returned strings are lowercase. For
// applying regexps to a search text, the caller does the string
// matching using the strings returned. When doing the string match,
// note that the caller has to do that on lower cased version of the
// search text. Then call FirstMatch or AllMatches with a vector of
// indices of strings that were found in the text to get the actual
// regexp matches.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
#include <vector>
#include "re2.h"
namespace re2 {
using std::vector;
class PrefilterTree;
class FilteredRE2 {
public:
FilteredRE2();
~FilteredRE2();
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int *id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased. When doing string matching,
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// all Add calls are done.
void Compile(vector<string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(const StringPiece& text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(const StringPiece& text,
const vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matched_regexps.
bool AllMatches(const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const;
// Returns the indices of all potentially matching regexps after first
// clearing potential_regexps.
// A regexp is potentially matching if it passes the filter.
// If a regexp passes the filter it may still not match.
// A regexp that does not pass the filter is guaranteed to not match.
void AllPotentials(const vector<int>& atoms,
vector<int>* potential_regexps) const;
// The number of regexps added.
int NumRegexps() const { return re2_vec_.size(); }
private:
// Get the individual RE2 objects. Useful for testing.
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
//DISALLOW_COPY_AND_ASSIGN(FilteredRE2);
FilteredRE2(const FilteredRE2&);
void operator=(const FilteredRE2&);
};
} // namespace re2
#endif // RE2_FILTERED_RE2_H_

View File

@@ -0,0 +1,185 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Determine whether this library should match PCRE exactly
// for a particular Regexp. (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
// * the regexp contains a repetition of an empty string,
// like (a*)* or (a*)+. In this case, PCRE will treat
// the repetition sequence as ending with an empty string,
// while this library does not.
// * Perl and PCRE differ on whether \v matches \n.
// For historical reasons, this library implements the Perl behavior.
// * Perl and PCRE allow $ in one-line mode to match either the very
// end of the text or just before a \n at the end of the text.
// This library requires it to match only the end of the text.
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
// match the end of the text if the last character is a \n.
// This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);
// Walker class to compute whether library handles a regexp
// exactly as PCRE would. See comment at top for conditions.
class PCREWalker : public Regexp::Walker<bool> {
public:
PCREWalker() {}
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
return a;
}
};
// Called after visiting each of re's children and accumulating
// the return values in child_args. So child_args contains whether
// this library mimics PCRE for those subexpressions.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
// If children failed, so do we.
for (int i = 0; i < nchild_args; i++)
if (!child_args[i])
return false;
// Otherwise look for other reasons to fail.
switch (re->op()) {
// Look for repeated empty string.
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
if (CanBeEmptyString(re->sub()[0]))
return false;
break;
case kRegexpRepeat:
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
return false;
break;
// Look for \v
case kRegexpLiteral:
if (re->rune() == '\v')
return false;
break;
// Look for $ in single-line mode.
case kRegexpEndText:
case kRegexpEmptyMatch:
if (re->parse_flags() & Regexp::WasDollar)
return false;
break;
// Look for ^ in multi-line mode.
case kRegexpBeginLine:
// No condition: in single-line mode ^ becomes kRegexpBeginText.
return false;
default:
break;
}
// Not proven guilty.
return true;
}
// Returns whether this regexp's behavior will mimic PCRE's exactly.
bool Regexp::MimicsPCRE() {
PCREWalker w;
return w.Walk(this, true);
}
// Walker class to compute whether a Regexp can match an empty string.
// It is okay to overestimate. For example, \b\B cannot match an empty
// string, because \b and \B are mutually exclusive, but this isn't
// that smart and will say it can. Spurious empty strings
// will reduce the number of regexps we sanity check against PCRE,
// but they won't break anything.
class EmptyStringWalker : public Regexp::Walker<bool> {
public:
EmptyStringWalker() { }
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
return a;
}
private:
DISALLOW_COPY_AND_ASSIGN(EmptyStringWalker);
};
// Called after visiting re's children. child_args contains the return
// value from each of the children's PostVisits (i.e., whether each child
// can match an empty string). Returns whether this clause can match an
// empty string.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch: // never empty
case kRegexpLiteral:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpCharClass:
case kRegexpLiteralString:
return false;
case kRegexpEmptyMatch: // always empty
case kRegexpBeginLine: // always empty, when they match
case kRegexpEndLine:
case kRegexpNoWordBoundary:
case kRegexpWordBoundary:
case kRegexpBeginText:
case kRegexpEndText:
case kRegexpStar: // can always be empty
case kRegexpQuest:
case kRegexpHaveMatch:
return true;
case kRegexpConcat: // can be empty if all children can
for (int i = 0; i < nchild_args; i++)
if (!child_args[i])
return false;
return true;
case kRegexpAlternate: // can be empty if any child can
for (int i = 0; i < nchild_args; i++)
if (child_args[i])
return true;
return false;
case kRegexpPlus: // can be empty if the child can
case kRegexpCapture:
return child_args[0];
case kRegexpRepeat: // can be empty if child can or is x{0}
return child_args[0] || re->min() == 0;
}
return false;
}
// Returns whether re can match an empty string.
static bool CanBeEmptyString(Regexp* re) {
EmptyStringWalker w;
return w.Walk(re, true);
}
} // namespace re2

View File

@@ -0,0 +1,757 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
namespace re2 {
class NFA {
public:
NFA(Prog* prog);
~NFA();
// Searches for a matching string.
// * If anchored is true, only considers matches starting at offset.
// Otherwise finds lefmost match at or after offset.
// * If longest is true, returns the longest match starting
// at the chosen start point. Otherwise returns the so-called
// left-biased match, the one traditional backtracking engines
// (like Perl and PCRE) find.
// Records submatch boundaries in submatch[1..nsubmatch-1].
// Submatch[0] is the entire match. When there is a choice in
// which text matches each subexpression, the submatch boundaries
// are chosen to match what a backtracking implementation would choose.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
static const int Debug = 0;
private:
struct Thread {
union {
int id;
Thread* next; // when on free list
};
const char** capture;
};
// State for explicit stack in AddToThreadq.
struct AddState {
int id; // Inst to process
int j;
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
AddState()
: id(0), j(-1), cap_j(NULL) {}
explicit AddState(int id)
: id(id), j(-1), cap_j(NULL) {}
AddState(int id, const char* cap_j, int j)
: id(id), j(j), cap_j(cap_j) {}
};
// Threadq is a list of threads. The list is sorted by the order
// in which Perl would explore that particular state -- the earlier
// choices appear earlier in the list.
typedef SparseArray<Thread*> Threadq;
inline Thread* AllocThread();
inline void FreeThread(Thread*);
// Add id (or its children, following unlabeled arrows)
// to the workqueue q with associated capture info.
void AddToThreadq(Threadq* q, int id, int flag,
const char* p, const char** capture);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// p is position of the next byte (the one after c)
// in the input string, used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
// Returns text version of capture information, for debugging.
string FormatCapture(const char** capture);
inline void CopyCapture(const char** dst, const char** src);
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
const char* btext_; // beginning of text being matched (for FormatSubmatch)
const char* etext_; // end of text being matched (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
const char** match_; // best match so far
bool matched_; // any match so far?
AddState* astack_; // pre-allocated for AddToThreadq
int nastack_;
int first_byte_; // required first byte for match, or -1 if none
Thread* free_threads_; // free list
DISALLOW_COPY_AND_ASSIGN(NFA);
};
NFA::NFA(Prog* prog) {
prog_ = prog;
start_ = prog->start();
ncapture_ = 0;
longest_ = false;
endmatch_ = false;
btext_ = NULL;
etext_ = NULL;
q0_.resize(prog_->size());
q1_.resize(prog_->size());
nastack_ = 2*prog_->size();
astack_ = new AddState[nastack_];
match_ = NULL;
matched_ = false;
free_threads_ = NULL;
first_byte_ = ComputeFirstByte();
}
NFA::~NFA() {
delete[] match_;
delete[] astack_;
Thread* next;
for (Thread* t = free_threads_; t; t = next) {
next = t->next;
delete[] t->capture;
delete t;
}
}
void NFA::FreeThread(Thread *t) {
if (t == NULL)
return;
t->next = free_threads_;
free_threads_ = t;
}
NFA::Thread* NFA::AllocThread() {
Thread* t = free_threads_;
if (t == NULL) {
t = new Thread;
t->capture = new const char*[ncapture_];
return t;
}
free_threads_ = t->next;
return t;
}
void NFA::CopyCapture(const char** dst, const char** src) {
for (int i = 0; i < ncapture_; i+=2) {
dst[i] = src[i];
dst[i+1] = src[i+1];
}
}
// Follows all empty arrows from id0 and enqueues all the states reached.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// The pointer p is the current input position, and m is the
// current set of match boundaries.
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
const char* p, const char** capture) {
if (id0 == 0)
return;
// Astack_ is pre-allocated to avoid resize operations.
// It has room for 2*prog_->size() entries, which is enough:
// Each inst in prog can be processed at most once,
// pushing at most two entries on stk.
int nstk = 0;
AddState* stk = astack_;
stk[nstk++] = AddState(id0);
while (nstk > 0) {
DCHECK_LE(nstk, nastack_);
const AddState& a = stk[--nstk];
if (a.j >= 0)
capture[a.j] = a.cap_j;
int id = a.id;
if (id == 0)
continue;
if (q->has_index(id)) {
if (Debug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
continue;
}
// Create entry in q no matter what. We might fill it in below,
// or we might not. Even if not, it is necessary to have it,
// so that we don't revisit id0 during the recursion.
q->set_new(id, NULL);
Thread** tp = &q->find(id)->second;
int j;
Thread* t;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
break;
case kInstFail:
break;
case kInstAltMatch:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
// fall through
case kInstAlt:
// Explore alternatives.
stk[nstk++] = AddState(ip->out1());
stk[nstk++] = AddState(ip->out());
break;
case kInstNop:
// Continue on.
stk[nstk++] = AddState(ip->out());
break;
case kInstCapture:
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore capture[j]
// once we finish exploring this possibility.
stk[nstk++] = AddState(0, capture[j], j);
// Record capture.
capture[j] = p;
}
stk[nstk++] = AddState(ip->out());
break;
case kInstMatch:
case kInstByteRange:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
if (Debug)
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
break;
case kInstEmptyWidth:
// Continue on if we have all the right flag bits.
if (ip->empty() & ~flag)
break;
stk[nstk++] = AddState(ip->out());
break;
}
}
}
// Run runq on byte c, appending new states to nextq.
// Updates match as new, better matches are found.
// p is position of the byte c in the input string,
// used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
if (longest_) {
// Can skip any threads started after our current best match.
if (matched_ && match_[0] < t->capture[0]) {
FreeThread(t);
continue;
}
}
int id = t->id;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
// Should only see the values handled below.
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
break;
case kInstByteRange:
if (ip->Matches(c))
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
break;
case kInstAltMatch:
if (i != runq->begin())
break;
// The match is ours if we want it.
if (ip->greedy(prog_) || longest_) {
CopyCapture((const char**)match_, t->capture);
FreeThread(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
runq->clear();
matched_ = true;
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
}
break;
case kInstMatch:
if (endmatch_ && p != etext_)
break;
const char* old = t->capture[1]; // previous end pointer
t->capture[1] = p;
if (longest_) {
// Leftmost-longest mode: save this match only if
// it is either farther to the left or at the same
// point but longer than an existing match.
if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
CopyCapture((const char**)match_, t->capture);
} else {
// Leftmost-biased mode: this match is by definition
// better than what we've already found (see next line).
CopyCapture((const char**)match_, t->capture);
// Cut off the threads that can only find matches
// worse than the one we just found: don't run the
// rest of the current Threadq.
t->capture[0] = old;
FreeThread(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
runq->clear();
matched_ = true;
return 0;
}
t->capture[0] = old;
matched_ = true;
break;
}
FreeThread(t);
}
runq->clear();
return 0;
}
string NFA::FormatCapture(const char** capture) {
string s;
for (int i = 0; i < ncapture_; i+=2) {
if (capture[i] == NULL)
StringAppendF(&s, "(?,?)");
else if (capture[i+1] == NULL)
StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
else
StringAppendF(&s, "(%d,%d)",
(int)(capture[i] - btext_),
(int)(capture[i+1] - btext_));
}
return s;
}
// Returns whether haystack contains needle's memory.
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
return haystack.begin() <= needle.begin() &&
haystack.end() >= needle.end();
}
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (!StringPieceContains(context, text)) {
LOG(FATAL) << "Bad args: context does not contain text "
<< reinterpret_cast<const void*>(context.begin())
<< "+" << context.size() << " "
<< reinterpret_cast<const void*>(text.begin())
<< "+" << text.size();
return false;
}
if (prog_->anchor_start() && context.begin() != text.begin())
return false;
if (prog_->anchor_end() && context.end() != text.end())
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
// Save search parameters.
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
// We need to maintain match[0], both to distinguish the
// longest match (if longest is true) and also to tell
// whether we've seen any matches at all.
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
matched_ = false;
memset(match_, 0, ncapture_*sizeof match_[0]);
// For debugging prints.
btext_ = context.begin();
if (Debug) {
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
text.as_string().c_str(), context.as_string().c_str(), anchored,
longest);
}
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
const char* bp = context.begin();
int c = -1;
int wasword = 0;
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
wasword = Prog::IsWordChar(c);
}
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
// Check for empty-width specials.
int flag = 0;
// ^ and \A
if (p == context.begin())
flag |= kEmptyBeginText | kEmptyBeginLine;
else if (p <= context.end() && p[-1] == '\n')
flag |= kEmptyBeginLine;
// $ and \z
if (p == context.end())
flag |= kEmptyEndText | kEmptyEndLine;
else if (p < context.end() && p[0] == '\n')
flag |= kEmptyEndLine;
// \b and \B
int isword = 0;
if (p < context.end())
isword = Prog::IsWordChar(p[0] & 0xFF);
if (isword != wasword)
flag |= kEmptyWordBoundary;
else
flag |= kEmptyNonWordBoundary;
if (Debug) {
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
fprintf(stderr, " %d%s", t->id,
FormatCapture((const char**)t->capture).c_str());
}
fprintf(stderr, "\n");
}
// Process previous character (waited until now to avoid
// repeating the flag computation above).
// This is a no-op the first time around the loop, because
// runq is empty.
int id = Step(runq, nextq, c, flag, p-1);
DCHECK_EQ(runq->size(), 0);
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
p = text.end();
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
if (ip->cap() < ncapture_)
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
case kInstEmptyWidth:
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
break;
}
id = ip->out();
continue;
}
break;
}
break;
}
if (p > text.end())
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.begin())) {
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
p < text.end() && (p[0] & 0xFF) != first_byte_) {
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
} else {
isword = Prog::IsWordChar(p[0] & 0xFF);
}
flag = Prog::EmptyFlags(context, p);
}
// Steal match storage (cleared but unused as of yet)
// temporarily to hold match boundaries for new thread.
match_[0] = p;
AddToThreadq(runq, start_, flag, p, match_);
match_[0] = NULL;
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (Debug)
fprintf(stderr, "dead\n");
break;
}
if (p == text.end())
c = 0;
else
c = *p & 0xFF;
wasword = isword;
// Will run step(runq, nextq, c, ...) on next iteration. See above.
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
FreeThread(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
if (Debug)
fprintf(stderr, "match (%d,%d)\n",
static_cast<int>(match_[0] - btext_),
static_cast<int>(match_[1] - btext_));
return true;
}
VLOG(1) << "No matches found";
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int NFA::ComputeFirstByte() {
if (start_ == 0)
return -1;
int b = -1; // first byte, not yet computed
typedef SparseSet Workq;
Workq q(prog_->size());
q.insert(start_);
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
// (assume all possible empty-width flags are true).
if (ip->out())
q.insert(ip->out());
break;
case kInstAlt:
case kInstAltMatch:
// Explore alternatives.
if (ip->out())
q.insert(ip->out());
if (ip->out1())
q.insert(ip->out1());
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (NFA::Debug)
Dump();
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
// For each instruction i in the program reachable from the start, compute the
// number of instructions reachable from i by following only empty transitions
// and record that count as fanout[i].
//
// fanout holds the results and is also the work queue for the outer iteration.
// reachable holds the reached nodes for the inner iteration.
void Prog::Fanout(SparseArray<int>* fanout) {
DCHECK_EQ(fanout->max_size(), size());
SparseSet reachable(size());
fanout->clear();
fanout->set_new(start(), 0);
for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
int* count = &i->second;
reachable.clear();
reachable.insert(i->index());
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
Prog::Inst* ip = inst(*j);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
break;
case kInstByteRange:
(*count)++;
if (!fanout->has_index(ip->out())) {
fanout->set_new(ip->out(), 0);
}
break;
case kInstAlt:
case kInstAltMatch:
reachable.insert(ip->out1());
// fall through
case kInstCapture:
case kInstEmptyWidth:
case kInstNop:
reachable.insert(ip->out());
break;
case kInstMatch:
case kInstFail:
break;
}
}
}
}
} // namespace re2

View File

@@ -0,0 +1,610 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchOnePass is an efficient implementation of
// regular expression search with submatch tracking for
// what I call "one-pass regular expressions". (An alternate
// name might be "backtracking-free regular expressions".)
//
// One-pass regular expressions have the property that
// at each input byte during an anchored match, there may be
// multiple alternatives but only one can proceed for any
// given input byte.
//
// For example, the regexp /x*yx*/ is one-pass: you read
// x's until a y, then you read the y, then you keep reading x's.
// At no point do you have to guess what to do or back up
// and try a different guess.
//
// On the other hand, /x*x/ is not one-pass: when you're
// looking at an input "x", it's not clear whether you should
// use it to extend the x* or as the final x.
//
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
//
// A simple intuition for identifying one-pass regular expressions
// is that it's always immediately obvious when a repetition ends.
// It must also be immediately obvious which branch of an | to take:
//
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
//
// The NFA-based search in nfa.cc does some bookkeeping to
// avoid the need for backtracking and its associated exponential blowup.
// But if we have a one-pass regular expression, there is no
// possibility of backtracking, so there is no need for the
// extra bookkeeping. Hence, this code.
//
// On a one-pass regular expression, the NFA code in nfa.cc
// runs at about 1/20 of the backtracking-based PCRE speed.
// In contrast, the code in this file runs at about the same
// speed as PCRE.
//
// One-pass regular expressions get used a lot when RE is
// used for parsing simple strings, so it pays off to
// notice them and handle them efficiently.
//
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <string.h>
#include <map>
#include "util/util.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
static const int Debug = 0;
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
// is contained. To explain what that means, first a
// refresher about what regular expression programs look like
// and how the usual NFA execution runs.
//
// In a regular expression program, only the kInstByteRange
// instruction processes an input byte c and moves on to the
// next byte in the string (it does so if c is in the given range).
// The kInstByteRange instructions correspond to literal characters
// and character classes in the regular expression.
//
// The kInstAlt instructions are used as wiring to connect the
// kInstByteRange instructions together in interesting ways when
// implementing | + and *.
// The kInstAlt instruction forks execution, like a goto that
// jumps to ip->out() and ip->out1() in parallel. Each of the
// resulting computation paths is called a thread.
//
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
// are interesting in their own right but like kInstAlt they don't
// advance the input pointer. Only kInstByteRange does.
//
// The automaton execution in nfa.cc runs all the possible
// threads of execution in lock-step over the input. To process
// a particular byte, each thread gets run until it either dies
// or finds a kInstByteRange instruction matching the byte.
// If the latter happens, the thread stops just past the
// kInstByteRange instruction (at ip->out()) and waits for
// the other threads to finish processing the input byte.
// Then, once all the threads have processed that input byte,
// the whole process repeats. The kInstAlt state instruction
// might create new threads during input processing, but no
// matter what, all the threads stop after a kInstByteRange
// and wait for the other threads to "catch up".
// Running in lock step like this ensures that the NFA reads
// the input string only once.
//
// Each thread maintains its own set of capture registers
// (the string positions at which it executed the kInstCapture
// instructions corresponding to capturing parentheses in the
// regular expression). Repeated copying of the capture registers
// is the main performance bottleneck in the NFA implementation.
//
// A regular expression program is "one-pass" if, no matter what
// the input string, there is only one thread that makes it
// past a kInstByteRange instruction at each input byte. This means
// that there is in some sense only one active thread throughout
// the execution. Other threads might be created during the
// processing of an input byte, but they are ephemeral: only one
// thread is left to start processing the next input byte.
// This is what I meant above when I said the non-determinism
// was "contained".
//
// To execute a one-pass regular expression program, we can build
// a DFA (no non-determinism) that has at most as many states as
// the NFA (compare this to the possibly exponential number of states
// in the general case). Each state records, for each possible
// input byte, the next state along with the conditions required
// before entering that state -- empty-width flags that must be true
// and capture operations that must be performed. It also records
// whether a set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
struct OneState {
uint32 matchcond; // conditions to match right now.
uint32 action[1];
};
// The uint32 conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
//
// Bits 0-5 are the empty-width flags from prog.h.
// Bit 6 is kMatchWins, which means the match takes
// priority over moving to next in a first-match search.
// The remaining bits mark capture registers that should
// be set to the current input position. The capture bits
// start at index 2, since the search loop can take care of
// cap[0], cap[1] (the overall match position).
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
// No input position can satisfy both kEmptyWordBoundary
// and kEmptyNonWordBoundary, so we can use that as a sentinel
// instead of needing an extra bit.
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
static const int kRealCapShift = kEmptyShift + 1;
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
// Parameters used to skip over cap[0], cap[1].
static const int kCapShift = kRealCapShift - 2;
static const int kMaxCap = kRealMaxCap + 2;
static const uint32 kMatchWins = 1 << kEmptyShift;
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Check, at compile time, that prog.h agrees with math above.
// This function is never called.
void OnePass_Checks() {
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
kEmptyShift_disagrees_with_kEmptyAllFlags);
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
kMaxCap_disagrees_with_kMaxOnePassCapture);
}
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
uint32 satisfied = Prog::EmptyFlags(context, p);
if (cond & kEmptyAllFlags & ~satisfied)
return false;
return true;
}
// Apply the capture bits in cond, saving p to the appropriate
// locations in cap[].
static void ApplyCaptures(uint32 cond, const char* p,
const char** cap, int ncap) {
for (int i = 2; i < ncap; i++)
if (cond & (1 << kCapShift << i))
cap[i] = p;
}
// Compute a node pointer.
// Basically (OneState*)(nodes + statesize*nodeindex)
// but the version with the C++ casts overflows 80 characters (and is ugly).
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
int nodeindex) {
return reinterpret_cast<OneState*>(
const_cast<uint8*>(nodes + statesize*nodeindex));
}
bool Prog::SearchOnePass(const StringPiece& text,
const StringPiece& const_context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (anchor != kAnchored && kind != kFullMatch) {
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
return false;
}
// Make sure we have at least cap[1],
// because we use it to tell if we matched.
int ncap = 2*nmatch;
if (ncap < 2)
ncap = 2;
const char* cap[kMaxCap];
for (int i = 0; i < ncap; i++)
cap[i] = NULL;
const char* matchcap[kMaxCap];
for (int i = 0; i < ncap; i++)
matchcap[i] = NULL;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (anchor_start() && context.begin() != text.begin())
return false;
if (anchor_end() && context.end() != text.end())
return false;
if (anchor_end())
kind = kFullMatch;
// State and act are marked volatile to
// keep the compiler from re-ordering the
// memory accesses walking over the NFA.
// This is worth about 5%.
volatile OneState* state = onepass_start_;
volatile uint8* nodes = onepass_nodes_;
volatile uint32 statesize = onepass_statesize_;
uint8* bytemap = bytemap_;
const char* bp = text.begin();
const char* ep = text.end();
const char* p;
bool matched = false;
matchcap[0] = bp;
cap[0] = bp;
uint32 nextmatchcond = state->matchcond;
for (p = bp; p < ep; p++) {
int c = bytemap[*p & 0xFF];
uint32 matchcond = nextmatchcond;
uint32 cond = state->action[c];
// Determine whether we can reach act->next.
// If so, advance state and nextmatchcond.
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
uint32 nextindex = cond >> kIndexShift;
state = IndexToNode(nodes, statesize, nextindex);
nextmatchcond = state->matchcond;
} else {
state = NULL;
nextmatchcond = kImpossible;
}
// This code section is carefully tuned.
// The goto sequence is about 10% faster than the
// obvious rewrite as a large if statement in the
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
// Saving the match capture registers is expensive.
// Is this intermediate match worth thinking about?
// Not if we want a full match.
if (kind == kFullMatch)
goto skipmatch;
// Not if it's impossible.
if (matchcond == kImpossible)
goto skipmatch;
// Not if the possible match is beaten by the certain
// match at the next byte. When this test is useless
// (e.g., HTTPPartialMatchRE2) it slows the loop by
// about 10%, but when it avoids work (e.g., DotMatchRE2),
// it cuts the loop execution by about 45%.
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
goto skipmatch;
// Finally, the match conditions must be satisfied.
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
for (int i = 2; i < 2*nmatch; i++)
matchcap[i] = cap[i];
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, matchcap, ncap);
matchcap[1] = p;
matched = true;
// If we're in longest match mode, we have to keep
// going and see if we find a longer match.
// In first match mode, we can stop if the match
// takes priority over the next state for this input byte.
// That bit is per-input byte and thus in cond, not matchcond.
if (kind == kFirstMatch && (cond & kMatchWins))
goto done;
}
skipmatch:
if (state == NULL)
goto done;
if ((cond & kCapMask) && nmatch > 1)
ApplyCaptures(cond, p, cap, ncap);
}
// Look for match at end of input.
{
uint32 matchcond = state->matchcond;
if (matchcond != kImpossible &&
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, cap, ncap);
for (int i = 2; i < ncap; i++)
matchcap[i] = cap[i];
matchcap[1] = p;
matched = true;
}
}
done:
if (!matched)
return false;
for (int i = 0; i < nmatch; i++)
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
return true;
}
// Analysis to determine whether a given regexp program is one-pass.
// If ip is not on workq, adds ip to work queue and returns true.
// If ip is already on work queue, does nothing and returns false.
// If ip is NULL, does nothing and returns true (pretends to add it).
typedef SparseSet Instq;
static bool AddQ(Instq *q, int id) {
if (id == 0)
return true;
if (q->contains(id))
return false;
q->insert(id);
return true;
}
struct InstCond {
int id;
uint32 cond;
};
// Returns whether this is a one-pass program; that is,
// returns whether it is safe to use SearchOnePass on this program.
// These conditions must be true for any instruction ip:
//
// (1) for any other Inst nip, there is at most one input-free
// path from ip to nip.
// (2) there is at most one kInstByte instruction reachable from
// ip that matches any particular byte c.
// (3) there is at most one input-free path from ip to a kInstMatch
// instruction.
//
// This is actually just a conservative approximation: it might
// return false when the answer is true, when kInstEmptyWidth
// instructions are involved.
// Constructs and saves corresponding one-pass NFA on success.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_start_ != NULL;
did_onepass_ = true;
if (start() == 0) // no match
return false;
// Steal memory for the one-pass NFA from the overall DFA budget.
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + byte_inst_count_;
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
int size = this->size();
InstCond *stack = new InstCond[size];
int* nodebyid = new int[size]; // indexed by ip
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
uint8* nodes = new uint8[maxnodes*statesize];
uint8* nodep = nodes;
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
nodep += statesize;
int nalloc = 1;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes, statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
for (int b = 0; b < bytemap_range_; b++)
node->action[b] = kImpossible;
node->matchcond = kImpossible;
workq.clear();
bool matched = false;
int nstack = 0;
stack[nstack].id = id;
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
Prog::Inst* ip = inst(id);
uint32 cond = stack[nstack].cond;
switch (ip->opcode()) {
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
// Fall through.
case kInstAlt:
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
goto fail;
stack[nstack].id = ip->out1();
stack[nstack++].cond = cond;
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstByteRange: {
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (Debug)
LOG(ERROR)
<< StringPrintf("Not OnePass: hit node limit %d > %d",
nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
nodep += statesize;
nodebyid[ip->out()] = nextindex;
nalloc++;
AddQ(&tovisit, ip->out());
}
if (matched)
cond |= kMatchWins;
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in byte class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
if (ip->foldcase()) {
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
}
break;
}
case kInstCapture:
if (ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
goto QueueEmpty;
case kInstEmptyWidth:
cond |= ip->empty();
goto QueueEmpty;
case kInstNop:
QueueEmpty:
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
// We could be a little more precise by looking at what c
// is, but that seems like overkill.
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
" %d -> %d\n",
*it, ip->out());
}
goto fail;
}
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstMatch:
if (matched) {
// (3) is violated
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
" from %d\n", *it);
}
goto fail;
}
matched = true;
node->matchcond = cond;
break;
case kInstFail:
break;
}
}
}
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
string dump = "prog dump:\n" + Dump() + "node dump\n";
map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
StringAppendF(&dump, "byte ranges:\n");
int i = 0;
for (int b = 0; b < bytemap_range_; b++) {
int lo = i;
while (bytemap_[i] == b)
i++;
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
}
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes, statesize, nodeindex);
string s;
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
if ((node->action[i] & kImpossible) == kImpossible)
continue;
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
i, node->action[i] & 0xFFFF,
node->action[i] >> kIndexShift,
idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << dump;
}
// Overallocated earlier; cut down to actual size.
nodep = new uint8[nalloc*statesize];
memmove(nodep, nodes, nalloc*statesize);
delete[] nodes;
nodes = nodep;
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
onepass_nodes_ = nodes;
onepass_statesize_ = statesize;
dfa_mem_ -= nalloc*statesize;
delete[] stack;
delete[] nodebyid;
return true;
fail:
delete[] stack;
delete[] nodebyid;
delete[] nodes;
return false;
}
} // namespace re2

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,119 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
static const URange16 code1[] = { /* \d */
{ 0x30, 0x39 },
};
static const URange16 code2[] = { /* \s */
{ 0x9, 0xa },
{ 0xc, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code3[] = { /* \w */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1 },
{ "\\D", -1, code1, 1 },
{ "\\s", +1, code2, 3 },
{ "\\S", -1, code2, 3 },
{ "\\w", +1, code3, 4 },
{ "\\W", -1, code3, 4 },
};
const int num_perl_groups = 6;
static const URange16 code4[] = { /* [:alnum:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code5[] = { /* [:alpha:] */
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code6[] = { /* [:ascii:] */
{ 0x0, 0x7f },
};
static const URange16 code7[] = { /* [:blank:] */
{ 0x9, 0x9 },
{ 0x20, 0x20 },
};
static const URange16 code8[] = { /* [:cntrl:] */
{ 0x0, 0x1f },
{ 0x7f, 0x7f },
};
static const URange16 code9[] = { /* [:digit:] */
{ 0x30, 0x39 },
};
static const URange16 code10[] = { /* [:graph:] */
{ 0x21, 0x7e },
};
static const URange16 code11[] = { /* [:lower:] */
{ 0x61, 0x7a },
};
static const URange16 code12[] = { /* [:print:] */
{ 0x20, 0x7e },
};
static const URange16 code13[] = { /* [:punct:] */
{ 0x21, 0x2f },
{ 0x3a, 0x40 },
{ 0x5b, 0x60 },
{ 0x7b, 0x7e },
};
static const URange16 code14[] = { /* [:space:] */
{ 0x9, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code15[] = { /* [:upper:] */
{ 0x41, 0x5a },
};
static const URange16 code16[] = { /* [:word:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
static const URange16 code17[] = { /* [:xdigit:] */
{ 0x30, 0x39 },
{ 0x41, 0x46 },
{ 0x61, 0x66 },
};
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3 },
{ "[:^alnum:]", -1, code4, 3 },
{ "[:alpha:]", +1, code5, 2 },
{ "[:^alpha:]", -1, code5, 2 },
{ "[:ascii:]", +1, code6, 1 },
{ "[:^ascii:]", -1, code6, 1 },
{ "[:blank:]", +1, code7, 2 },
{ "[:^blank:]", -1, code7, 2 },
{ "[:cntrl:]", +1, code8, 2 },
{ "[:^cntrl:]", -1, code8, 2 },
{ "[:digit:]", +1, code9, 1 },
{ "[:^digit:]", -1, code9, 1 },
{ "[:graph:]", +1, code10, 1 },
{ "[:^graph:]", -1, code10, 1 },
{ "[:lower:]", +1, code11, 1 },
{ "[:^lower:]", -1, code11, 1 },
{ "[:print:]", +1, code12, 1 },
{ "[:^print:]", -1, code12, 1 },
{ "[:punct:]", +1, code13, 4 },
{ "[:^punct:]", -1, code13, 4 },
{ "[:space:]", +1, code14, 2 },
{ "[:^space:]", -1, code14, 2 },
{ "[:upper:]", +1, code15, 1 },
{ "[:^upper:]", -1, code15, 1 },
{ "[:word:]", +1, code16, 4 },
{ "[:^word:]", -1, code16, 4 },
{ "[:xdigit:]", +1, code17, 3 },
{ "[:^xdigit:]", -1, code17, 3 },
};
const int num_posix_groups = 28;
} // namespace re2

View File

@@ -0,0 +1,709 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
static const int Trace = false;
typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;
GLOBAL_MUTEX(alloc_id_mutex);
static int alloc_id = 100000; // Used for debugging.
// Initializes a Prefilter, allocating subs_ as necessary.
Prefilter::Prefilter(Op op) {
op_ = op;
subs_ = NULL;
if (op_ == AND || op_ == OR)
subs_ = new vector<Prefilter*>;
GLOBAL_MUTEX_LOCK(alloc_id_mutex);
alloc_id_ = alloc_id++;
GLOBAL_MUTEX_UNLOCK(alloc_id_mutex);
VLOG(10) << "alloc_id: " << alloc_id_;
}
// Destroys a Prefilter.
Prefilter::~Prefilter() {
VLOG(10) << "Deleted: " << alloc_id_;
if (subs_) {
for (size_t i = 0; i < subs_->size(); i++)
delete (*subs_)[i];
delete subs_;
subs_ = NULL;
}
}
// Simplify if the node is an empty Or or And.
Prefilter* Prefilter::Simplify() {
if (op_ != AND && op_ != OR) {
return this;
}
// Nothing left in the AND/OR.
if (subs_->size() == 0) {
if (op_ == AND)
op_ = ALL; // AND of nothing is true
else
op_ = NONE; // OR of nothing is false
return this;
}
// Just one subnode: throw away wrapper.
if (subs_->size() == 1) {
Prefilter* a = (*subs_)[0];
subs_->clear();
delete this;
return a->Simplify();
}
return this;
}
// Combines two Prefilters together to create an "op" (AND or OR).
// The passed Prefilters will be part of the returned Prefilter or deleted.
// Does lots of work to avoid creating unnecessarily complicated structures.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
// If a, b can be rewritten as op, do so.
a = a->Simplify();
b = b->Simplify();
// Canonicalize: a->op <= b->op.
if (a->op() > b->op()) {
Prefilter* t = a;
a = b;
b = t;
}
// Trivial cases.
// ALL AND b = b
// NONE OR b = b
// ALL OR b = ALL
// NONE AND b = NONE
// Don't need to look at b, because of canonicalization above.
// ALL and NONE are smallest opcodes.
if (a->op() == ALL || a->op() == NONE) {
if ((a->op() == ALL && op == AND) ||
(a->op() == NONE && op == OR)) {
delete a;
return b;
} else {
delete b;
return a;
}
}
// If a and b match op, merge their contents.
if (a->op() == op && b->op() == op) {
for (size_t i = 0; i < b->subs()->size(); i++) {
Prefilter* bb = (*b->subs())[i];
a->subs()->push_back(bb);
}
b->subs()->clear();
delete b;
return a;
}
// If a already has the same op as the op that is under construction
// add in b (similarly if b already has the same op, add in a).
if (b->op() == op) {
Prefilter* t = a;
a = b;
b = t;
}
if (a->op() == op) {
a->subs()->push_back(b);
return a;
}
// Otherwise just return the op.
Prefilter* c = new Prefilter(op);
c->subs()->push_back(a);
c->subs()->push_back(b);
return c;
}
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
return AndOr(AND, a, b);
}
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
static void SimplifyStringSet(set<string> *ss) {
// Now make sure that the strings aren't redundant. For example, if
// we know "ab" is a required string, then it doesn't help at all to
// know that "abc" is also a required string, so delete "abc". This
// is because, when we are performing a string search to filter
// regexps, matching ab will already allow this regexp to be a
// candidate for match, so further matching abc is redundant.
for (SSIter i = ss->begin(); i != ss->end(); ++i) {
SSIter j = i;
++j;
while (j != ss->end()) {
// Increment j early so that we can erase the element it points to.
SSIter old_j = j;
++j;
if (old_j->find(*i) != string::npos)
ss->erase(old_j);
}
}
}
Prefilter* Prefilter::OrStrings(set<string>* ss) {
SimplifyStringSet(ss);
Prefilter* or_prefilter = NULL;
if (!ss->empty()) {
or_prefilter = new Prefilter(NONE);
for (SSIter i = ss->begin(); i != ss->end(); ++i)
or_prefilter = Or(or_prefilter, FromString(*i));
}
return or_prefilter;
}
static Rune ToLowerRune(Rune r) {
if (r < Runeself) {
if ('A' <= r && r <= 'Z')
r += 'a' - 'A';
return r;
}
const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
if (f == NULL || r < f->lo)
return r;
return ApplyFold(f, r);
}
static Rune ToLowerRuneLatin1(Rune r) {
if ('A' <= r && r <= 'Z')
r += 'a' - 'A';
return r;
}
Prefilter* Prefilter::FromString(const string& str) {
Prefilter* m = new Prefilter(Prefilter::ATOM);
m->atom_ = str;
return m;
}
// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
class Prefilter::Info {
public:
Info();
~Info();
// More constructors. They delete their Info* arguments.
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyChar();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
static Info* AnyMatch();
// Format Info as a string.
string ToString();
// Caller takes ownership of the Prefilter.
Prefilter* TakeMatch();
set<string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
class Walker;
private:
set<string> exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
// set of strings that match this RE, then is_exact_
// is false and the match_ contains the required match
// criteria.
bool is_exact_;
// Accumulated Prefilter query that any
// match for this regexp is guaranteed to match.
Prefilter* match_;
};
Prefilter::Info::Info()
: is_exact_(false),
match_(NULL) {
}
Prefilter::Info::~Info() {
delete match_;
}
Prefilter* Prefilter::Info::TakeMatch() {
if (is_exact_) {
match_ = Prefilter::OrStrings(&exact_);
is_exact_ = false;
}
Prefilter* m = match_;
match_ = NULL;
return m;
}
// Format a Info in string form.
string Prefilter::Info::ToString() {
if (is_exact_) {
int n = 0;
string s;
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
if (n++ > 0)
s += ",";
s += *i;
}
return s;
}
if (match_)
return match_->DebugString();
return "";
}
// Add the strings from src to dst.
static void CopyIn(const set<string>& src, set<string>* dst) {
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
dst->insert(*i);
}
// Add the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const set<string>& a,
const set<string>& b,
set<string>* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
}
// Concats a and b. Requires that both are exact sets.
// Forms an exact set that is a crossproduct of a and b.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
if (a == NULL)
return b;
DCHECK(a->is_exact_);
DCHECK(b && b->is_exact_);
Info *ab = new Info();
CrossProduct(a->exact_, b->exact_, &ab->exact_);
ab->is_exact_ = true;
delete a;
delete b;
return ab;
}
// Constructs an inexact Info for ab given a and b.
// Used only when a or b is not exact or when the
// exact cross product is likely to be too big.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
if (a == NULL)
return b;
if (b == NULL)
return a;
Info *ab = new Info();
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
delete a;
delete b;
return ab;
}
// Constructs Info for a|b given a and b.
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
Info *ab = new Info();
if (a->is_exact_ && b->is_exact_) {
CopyIn(a->exact_, &ab->exact_);
CopyIn(b->exact_, &ab->exact_);
ab->is_exact_ = true;
} else {
// Either a or b has is_exact_ = false. If the other
// one has is_exact_ = true, we move it to match_ and
// then create a OR of a,b. The resulting Info has
// is_exact_ = false.
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
}
delete a;
delete b;
return ab;
}
// Constructs Info for a? given a.
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
Info *ab = new Info();
ab->is_exact_ = false;
ab->match_ = new Prefilter(ALL);
delete a;
return ab;
}
// Constructs Info for a* given a.
// Same as a? -- not much to do.
Prefilter::Info* Prefilter::Info::Star(Info *a) {
return Quest(a);
}
// Constructs Info for a+ given a. If a was exact set, it isn't
// anymore.
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
Info *ab = new Info();
ab->match_ = a->TakeMatch();
ab->is_exact_ = false;
delete a;
return ab;
}
static string RuneToString(Rune r) {
char buf[UTFmax];
int n = runetochar(buf, &r);
return string(buf, n);
}
static string RuneToStringLatin1(Rune r) {
char c = r & 0xff;
return string(&c, 1);
}
// Constructs Info for literal rune.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
Info* info = new Info();
info->exact_.insert(RuneToString(ToLowerRune(r)));
info->is_exact_ = true;
return info;
}
// Constructs Info for literal rune for Latin1 encoded string.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
Info* info = new Info();
info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
info->is_exact_ = true;
return info;
}
// Constructs Info for dot (any character).
Prefilter::Info* Prefilter::Info::AnyChar() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
// Constructs Prefilter::Info for no possible match.
Prefilter::Info* Prefilter::Info::NoMatch() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(NONE);
return info;
}
// Constructs Prefilter::Info for any possible match.
// This Prefilter::Info is valid for any regular expression,
// since it makes no assertions whatsoever about the
// strings being matched.
Prefilter::Info* Prefilter::Info::AnyMatch() {
Prefilter::Info *info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
// Constructs Prefilter::Info for just the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
Prefilter::Info* info = new Prefilter::Info();
info->is_exact_ = true;
info->exact_.insert("");
return info;
}
// Constructs Prefilter::Info for a character class.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
bool latin1) {
if (Trace) {
VLOG(0) << "CharClassInfo:";
for (CCIter i = cc->begin(); i != cc->end(); ++i)
VLOG(0) << " " << i->lo << "-" << i->hi;
}
// If the class is too large, it's okay to overestimate.
if (cc->size() > 10)
return AnyChar();
Prefilter::Info *a = new Prefilter::Info();
for (CCIter i = cc->begin(); i != cc->end(); ++i)
for (Rune r = i->lo; r <= i->hi; r++) {
if (latin1) {
a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
} else {
a->exact_.insert(RuneToString(ToLowerRune(r)));
}
}
a->is_exact_ = true;
if (Trace) {
VLOG(0) << " = " << a->ToString();
}
return a;
}
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
Walker(bool latin1) : latin1_(latin1) {}
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
bool latin1() { return latin1_; }
private:
bool latin1_;
DISALLOW_COPY_AND_ASSIGN(Walker);
};
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
if (Trace) {
LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
}
bool latin1 = re->parse_flags() & Regexp::Latin1;
Prefilter::Info::Walker w(latin1);
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
if (w.stopped_early()) {
delete info;
return NULL;
}
return info;
}
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
Regexp* re, Prefilter::Info* parent_arg) {
return AnyMatch();
}
// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
default:
case kRegexpRepeat:
LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
// These ops match the empty string:
case kRegexpEmptyMatch: // anywhere
case kRegexpBeginLine: // at beginning of line
case kRegexpEndLine: // at end of line
case kRegexpBeginText: // at beginning of text
case kRegexpEndText: // at end of text
case kRegexpWordBoundary: // at word boundary
case kRegexpNoWordBoundary: // not at word boundary
info = EmptyString();
break;
case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
break;
case kRegexpLiteralString:
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, LiteralLatin1(re->runes()[i]));
}
} else {
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, Literal(re->runes()[i]));
}
}
break;
case kRegexpConcat: {
// Accumulate in info.
// Exact is concat of recent contiguous exact nodes.
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i]; // child info
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
// Exact run is over.
info = And(info, exact);
exact = NULL;
// Add this child's info.
info = And(info, ci);
} else {
// Append to exact run.
exact = Concat(exact, ci);
}
}
info = And(info, exact);
}
break;
case kRegexpAlternate:
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
VLOG(10) << "Alt: " << info->ToString();
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
// Claim nothing, except that it's not empty.
info = AnyChar();
break;
case kRegexpCharClass:
info = CClass(re->cc(), latin1());
break;
case kRegexpCapture:
// These don't affect the set of matching strings.
info = child_args[0];
break;
}
if (Trace) {
VLOG(0) << "BuildInfo " << re->ToString()
<< ": " << (info ? info->ToString() : "");
}
return info;
}
Prefilter* Prefilter::FromRegexp(Regexp* re) {
if (re == NULL)
return NULL;
Regexp* simple = re->Simplify();
Prefilter::Info *info = BuildInfo(simple);
simple->Decref();
if (info == NULL)
return NULL;
Prefilter* m = info->TakeMatch();
delete info;
return m;
}
string Prefilter::DebugString() const {
switch (op_) {
default:
LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
return StringPrintf("op%d", op_);
case NONE:
return "*no-matches*";
case ATOM:
return atom_;
case ALL:
return "";
case AND: {
string s = "";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += " ";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
return s;
}
case OR: {
string s = "(";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += "|";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
s += ")";
return s;
}
}
}
Prefilter* Prefilter::FromRE2(const RE2* re2) {
if (re2 == NULL)
return NULL;
Regexp* regexp = re2->Regexp();
if (regexp == NULL)
return NULL;
return FromRegexp(regexp);
}
} // namespace re2

View File

@@ -0,0 +1,105 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
#include "util/util.h"
namespace re2 {
class RE2;
class Regexp;
class Prefilter {
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
public:
enum Op {
ALL = 0, // Everything matches
NONE, // Nothing matches
ATOM, // The string atom() must match
AND, // All in subs() must match
OR, // One of subs() must match
};
explicit Prefilter(Op op);
~Prefilter();
Op op() { return op_; }
const string& atom() const { return atom_; }
void set_unique_id(int id) { unique_id_ = id; }
int unique_id() const { return unique_id_; }
// The children of the Prefilter node.
vector<Prefilter*>* subs() {
CHECK(op_ == AND || op_ == OR);
return subs_;
}
// Set the children vector. Prefilter takes ownership of subs and
// subs_ will be deleted when Prefilter is deleted.
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
// Given a RE2, return a Prefilter. The caller takes ownership of
// the Prefilter and should deallocate it. Returns NULL if Prefilter
// cannot be formed.
static Prefilter* FromRE2(const RE2* re2);
// Returns a readable debug string of the prefilter.
string DebugString() const;
private:
class Info;
// Combines two prefilters together to create an AND. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* And(Prefilter* a, Prefilter* b);
// Combines two prefilters together to create an OR. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* Or(Prefilter* a, Prefilter* b);
// Generalized And/Or
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
static Prefilter* FromRegexp(Regexp* a);
static Prefilter* FromString(const string& str);
static Prefilter* OrStrings(set<string>* ss);
static Info* BuildInfo(Regexp* re);
Prefilter* Simplify();
// Kind of Prefilter.
Op op_;
// Sub-matches for AND or OR Prefilter.
vector<Prefilter*>* subs_;
// Actual string to match in leaf node.
string atom_;
// If different prefilters have the same string atom, or if they are
// structurally the same (e.g., OR of same atom strings) they are
// considered the same unique nodes. This is the id for each unique
// node. This field is populated with a unique id for every node,
// and -1 for duplicate nodes.
int unique_id_;
// Used for debugging, helps in tracking memory leaks.
int alloc_id_;
DISALLOW_COPY_AND_ASSIGN(Prefilter);
};
} // namespace re2
#endif // RE2_PREFILTER_H_

View File

@@ -0,0 +1,401 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/flags.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
#include "re2.h"
DEFINE_int32(filtered_re2_min_atom_len,
3,
"Strings less than this length are not stored as atoms");
namespace re2 {
PrefilterTree::PrefilterTree()
: compiled_(false) {
}
PrefilterTree::~PrefilterTree() {
for (size_t i = 0; i < prefilter_vec_.size(); i++)
delete prefilter_vec_[i];
for (size_t i = 0; i < entries_.size(); i++)
delete entries_[i].parents;
}
// Functions used for adding and Compiling prefilters to the
// PrefilterTree.
static bool KeepPart(Prefilter* prefilter, int level) {
if (prefilter == NULL)
return false;
switch (prefilter->op()) {
default:
LOG(DFATAL) << "Unexpected op in KeepPart: "
<< prefilter->op();
return false;
case Prefilter::ALL:
return false;
case Prefilter::ATOM:
return prefilter->atom().size() >=
static_cast<size_t>(FLAGS_filtered_re2_min_atom_len);
case Prefilter::AND: {
int j = 0;
vector<Prefilter*>* subs = prefilter->subs();
for (size_t i = 0; i < subs->size(); i++)
if (KeepPart((*subs)[i], level + 1))
(*subs)[j++] = (*subs)[i];
else
delete (*subs)[i];
subs->resize(j);
return j > 0;
}
case Prefilter::OR:
for (size_t i = 0; i < prefilter->subs()->size(); i++)
if (!KeepPart((*prefilter->subs())[i], level + 1))
return false;
return true;
}
}
void PrefilterTree::Add(Prefilter *f) {
if (compiled_) {
LOG(DFATAL) << "Add after Compile.";
return;
}
if (f != NULL && !KeepPart(f, 0)) {
delete f;
f = NULL;
}
prefilter_vec_.push_back(f);
}
void PrefilterTree::Compile(vector<string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile after Compile.";
return;
}
// We do this check to support some legacy uses of
// PrefilterTree that call Compile before adding any regexps,
// and expect Compile not to have effect.
if (prefilter_vec_.empty())
return;
compiled_ = true;
AssignUniqueIds(atom_vec);
// Identify nodes that are too common among prefilters and are
// triggering too many parents. Then get rid of them if possible.
// Note that getting rid of a prefilter node simply means they are
// no longer necessary for their parent to trigger; that is, we do
// not miss out on any regexps triggering by getting rid of a
// prefilter node.
for (size_t i = 0; i < entries_.size(); i++) {
StdIntMap* parents = entries_[i].parents;
if (parents->size() > 8) {
// This one triggers too many things. If all the parents are AND
// nodes and have other things guarding them, then get rid of
// this trigger. TODO(vsri): Adjust the threshold appropriately,
// make it a function of total number of nodes?
bool have_other_guard = true;
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it) {
have_other_guard = have_other_guard &&
(entries_[it->first].propagate_up_at_count > 1);
}
if (have_other_guard) {
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it)
entries_[it->first].propagate_up_at_count -= 1;
parents->clear(); // Forget the parents
}
}
}
PrintDebugInfo();
}
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
string node_string = NodeString(node);
map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
if (iter == node_map_.end())
return NULL;
return (*iter).second;
}
static string Itoa(int n) {
char buf[100];
snprintf(buf, sizeof buf, "%d", n);
return string(buf);
}
string PrefilterTree::NodeString(Prefilter* node) const {
// Adding the operation disambiguates AND/OR/atom nodes.
string s = Itoa(node->op()) + ":";
if (node->op() == Prefilter::ATOM) {
s += node->atom();
} else {
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
s += ',';
s += Itoa((*node->subs())[i]->unique_id());
}
}
return s;
}
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(i);
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
for (size_t i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const vector<Prefilter*>& subs = *f->subs();
for (size_t j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
int unique_id = 0;
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
node_map_[NodeString(node)] = node;
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
node->set_unique_id(canonical->unique_id());
}
}
entries_.resize(node_map_.size());
// Create parent StdIntMap for the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new StdIntMap();
}
// Fill the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
switch (prefilter->op()) {
default:
case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entry->propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
set<int> uniq_child;
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
}
int child_id = canonical->unique_id();
uniq_child.insert(child_id);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (child_entry->parents->find(prefilter->unique_id()) ==
child_entry->parents->end()) {
(*child_entry->parents)[prefilter->unique_id()] = 1;
}
}
entry->propagate_up_at_count =
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(i);
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const vector<int>& matched_atoms,
vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
LOG(WARNING) << "Compile() not called";
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
regexps->push_back(i);
} else {
if (!prefilter_vec_.empty()) {
IntMap regexps_map(prefilter_vec_.size());
vector<int> matched_atom_ids;
for (size_t j = 0; j < matched_atoms.size(); j++) {
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
}
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
}
sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(entries_.size());
IntMap work(entries_.size());
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
VLOG(10) << "Processing: " << it->index();
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++) {
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
regexps->set(entry.regexps[i], 1);
}
int c;
// Pass trigger up to parents.
for (StdIntMap::iterator it = entry.parents->begin();
it != entry.parents->end();
++it) {
int j = it->first;
const Entry& parent = entries_[j];
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
if (c < parent.propagate_up_at_count)
continue;
}
VLOG(10) << "Triggering: " << j;
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
}
void PrefilterTree::PrintDebugInfo() {
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
VLOG(10) << "#Unique Nodes: " << entries_.size();
for (size_t i = 0; i < entries_.size(); ++i) {
StdIntMap* parents = entries_[i].parents;
const vector<int>& regexps = entries_[i].regexps;
VLOG(10) << "EntryId: " << i
<< " N: " << parents->size() << " R: " << regexps.size();
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
VLOG(10) << it->first;
}
VLOG(10) << "Map:";
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
iter != node_map_.end(); ++iter)
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
}
string PrefilterTree::DebugNodeString(Prefilter* node) const {
string node_string = "";
if (node->op() == Prefilter::ATOM) {
DCHECK(!node->atom().empty());
node_string += node->atom();
} else {
// Adding the operation disambiguates AND and OR nodes.
node_string += node->op() == Prefilter::AND ? "AND" : "OR";
node_string += "(";
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
node_string += ',';
node_string += Itoa((*node->subs())[i]->unique_id());
node_string += ":";
node_string += DebugNodeString((*node->subs())[i]);
}
node_string += ")";
}
return node_string;
}
} // namespace re2

View File

@@ -0,0 +1,131 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added tp PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
//
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
#include "util/util.h"
#include "util/sparse_array.h"
namespace re2 {
typedef SparseArray<int> IntMap;
typedef map<int, int> StdIntMap;
class Prefilter;
class PrefilterTree {
public:
PrefilterTree();
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
// Add called sequentially for all regexps. All Add calls
// must precede Compile.
void Add(Prefilter* prefilter);
// The Compile returns a vector of string in atom_vec.
// Call this after all the prefilters are added through Add.
// No calls to Add after Compile are allowed.
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the corresponding index then has to be
// and passed to RegexpsGivenStrings below.
void Compile(vector<string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
struct Entry {
public:
// How many children should match before this node triggers the
// parent. For an atom and an OR node, this is 1 and for an AND
// node, it is the number of unique children.
int propagate_up_at_count;
// When this node is ready to trigger the parent, what are the indices
// of the parent nodes to trigger. The reason there may be more than
// one is because of sharing. For example (abc | def) and (xyz | def)
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
StdIntMap* parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
vector<int> regexps;
};
private:
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(vector<string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
Prefilter* CanonicalNode(Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node has already been assigned unique ids.
string NodeString(Prefilter* node) const;
// Recursively constructs a readable prefilter string.
string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo();
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
vector<Entry> entries_;
// Map node string to canonical Prefilter node.
map<string, Prefilter*> node_map_;
// indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
vector<int> unfiltered_;
// vector of Prefilter for all regexps.
vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
DISALLOW_COPY_AND_ASSIGN(PrefilterTree);
};
} // namespace
#endif // RE2_PREFILTER_TREE_H_

View File

@@ -0,0 +1,343 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "util/util.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstAlt);
out1_ = out1;
}
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstByteRange);
lo_ = lo & 0xFF;
hi_ = hi & 0xFF;
foldcase_ = foldcase;
}
void Prog::Inst::InitCapture(int cap, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstCapture);
cap_ = cap;
}
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstEmptyWidth);
empty_ = empty;
}
void Prog::Inst::InitMatch(int32 id) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstMatch);
match_id_ = id;
}
void Prog::Inst::InitNop(uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstNop);
}
void Prog::Inst::InitFail() {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstFail);
}
string Prog::Inst::Dump() {
switch (opcode()) {
default:
return StringPrintf("opcode %d", static_cast<int>(opcode()));
case kInstAlt:
return StringPrintf("alt -> %d | %d", out(), out1_);
case kInstAltMatch:
return StringPrintf("altmatch -> %d | %d", out(), out1_);
case kInstByteRange:
return StringPrintf("byte%s [%02x-%02x] -> %d",
foldcase_ ? "/i" : "",
lo_, hi_, out());
case kInstCapture:
return StringPrintf("capture %d -> %d", cap_, out());
case kInstEmptyWidth:
return StringPrintf("emptywidth %#x -> %d",
static_cast<int>(empty_), out());
case kInstMatch:
return StringPrintf("match! %d", match_id());
case kInstNop:
return StringPrintf("nop -> %d", out());
case kInstFail:
return StringPrintf("fail");
}
}
Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
byte_inst_count_(0),
bytemap_range_(0),
flags_(0),
onepass_statesize_(0),
inst_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
dfa_mem_(0),
delete_dfa_(NULL),
unbytemap_(NULL),
onepass_nodes_(NULL),
onepass_start_(NULL) {
}
Prog::~Prog() {
if (delete_dfa_) {
if (dfa_first_)
delete_dfa_(dfa_first_);
if (dfa_longest_)
delete_dfa_(dfa_longest_);
}
delete[] onepass_nodes_;
delete[] inst_;
delete[] unbytemap_;
}
typedef SparseSet Workq;
static inline void AddToQueue(Workq* q, int id) {
if (id != 0)
q->insert(id);
}
static string ProgToString(Prog* prog, Workq* q) {
string s;
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
int id = *i;
Prog::Inst* ip = prog->inst(id);
StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
AddToQueue(q, ip->out());
if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
AddToQueue(q, ip->out1());
}
return s;
}
string Prog::Dump() {
string map;
if (false) { // Debugging
int lo = 0;
StringAppendF(&map, "byte map:\n");
for (int i = 0; i < bytemap_range_; i++) {
StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
lo = unbytemap_[i] + 1;
}
StringAppendF(&map, "\n");
}
Workq q(size_);
AddToQueue(&q, start_);
return map + ProgToString(this, &q);
}
string Prog::DumpUnanchored() {
Workq q(size_);
AddToQueue(&q, start_unanchored_);
return ProgToString(this, &q);
}
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
void Prog::Optimize() {
Workq q(size_);
// Eliminate nops. Most are taken out during compilation
// but a few are hard to avoid.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
int j = ip->out();
Inst* jp;
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->set_out(j);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt) {
j = ip->out1();
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->out1_ = j;
AddToQueue(&q, ip->out1());
}
}
// Insert kInstAltMatch instructions
// Look for
// ip: Alt -> j | k
// j: ByteRange [00-FF] -> ip
// k: Match
// or the reverse (the above is the greedy one).
// Rewrite Alt to AltMatch.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt)
AddToQueue(&q, ip->out1());
if (ip->opcode() == kInstAlt) {
Inst* j = inst(ip->out());
Inst* k = inst(ip->out1());
if (j->opcode() == kInstByteRange && j->out() == id &&
j->lo() == 0x00 && j->hi() == 0xFF &&
IsMatch(this, k)) {
ip->set_opcode(kInstAltMatch);
continue;
}
if (IsMatch(this, j) &&
k->opcode() == kInstByteRange && k->out() == id &&
k->lo() == 0x00 && k->hi() == 0xFF) {
ip->set_opcode(kInstAltMatch);
}
}
}
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
for (;;) {
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
return false;
case kInstAlt:
case kInstAltMatch:
case kInstByteRange:
case kInstFail:
case kInstEmptyWidth:
return false;
case kInstCapture:
case kInstNop:
ip = prog->inst(ip->out());
break;
case kInstMatch:
return true;
}
}
}
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
// ^ and \A
if (p == text.begin())
flags |= kEmptyBeginText | kEmptyBeginLine;
else if (p[-1] == '\n')
flags |= kEmptyBeginLine;
// $ and \z
if (p == text.end())
flags |= kEmptyEndText | kEmptyEndLine;
else if (p < text.end() && p[0] == '\n')
flags |= kEmptyEndLine;
// \b and \B
if (p == text.begin() && p == text.end()) {
// no word boundary here
} else if (p == text.begin()) {
if (IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
} else if (p == text.end()) {
if (IsWordChar(p[-1]))
flags |= kEmptyWordBoundary;
} else {
if (IsWordChar(p[-1]) != IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
}
if (!(flags & kEmptyWordBoundary))
flags |= kEmptyNonWordBoundary;
return flags;
}
void Prog::MarkByteRange(int lo, int hi) {
DCHECK_GE(lo, 0);
DCHECK_GE(hi, 0);
DCHECK_LE(lo, 255);
DCHECK_LE(hi, 255);
DCHECK_LE(lo, hi);
if (0 < lo && lo <= 255)
byterange_.Set(lo - 1);
if (0 <= hi && hi <= 255)
byterange_.Set(hi);
}
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for prog_.
// Ranges of bytes that are treated as indistinguishable
// by the regexp program are mapped to a single byte class.
// The vector prog_->byterange() marks the end of each
// such range.
const Bitmap<256>& v = byterange();
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
if ((i&31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
bits >>= 1;
}
bytemap_range_ = bytemap_[255] + 1;
unbytemap_ = new uint8[bytemap_range_];
for (int i = 0; i < 256; i++)
unbytemap_[bytemap_[i]] = i;
if (0) { // For debugging: use trivial byte map.
for (int i = 0; i < 256; i++) {
bytemap_[i] = i;
unbytemap_[i] = i;
}
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
}
}
} // namespace re2

View File

@@ -0,0 +1,381 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#ifndef RE2_PROG_H__
#define RE2_PROG_H__
#include "re2/util/util.h"
#include "re2/util/sparse_array.h"
#include "re2.h"
namespace re2 {
// Simple fixed-size bitmap.
template<int Bits>
class Bitmap {
public:
Bitmap() { Reset(); }
int Size() { return Bits; }
void Reset() {
for (int i = 0; i < Words; i++)
w_[i] = 0;
}
bool Get(int k) const {
return w_[k >> WordLog] & (1<<(k & 31));
}
void Set(int k) {
w_[k >> WordLog] |= 1<<(k & 31);
}
void Clear(int k) {
w_[k >> WordLog] &= ~(1<<(k & 31));
}
uint32 Word(int i) const {
return w_[i];
}
private:
static const int WordLog = 5;
static const int Words = (Bits+31)/32;
uint32 w_[Words];
DISALLOW_COPY_AND_ASSIGN(Bitmap);
};
// Opcodes for Inst
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
kInstCapture, // capturing parenthesis number cap_
kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
kInstMatch, // found a match!
kInstNop, // no-op; occasionally unavoidable
kInstFail, // never match; occasionally unavoidable
};
// Bit flags for empty-width specials
enum EmptyOp {
kEmptyBeginLine = 1<<0, // ^ - beginning of line
kEmptyEndLine = 1<<1, // $ - end of line
kEmptyBeginText = 1<<2, // \A - beginning of text
kEmptyEndText = 1<<3, // \z - end of text
kEmptyWordBoundary = 1<<4, // \b - word boundary
kEmptyNonWordBoundary = 1<<5, // \B - not \b
kEmptyAllFlags = (1<<6)-1,
};
class Regexp;
class DFA;
struct OneState;
// Compiled form of regexp program.
class Prog {
public:
Prog();
~Prog();
// Single instruction in regexp program.
class Inst {
public:
Inst() : out_opcode_(0), out1_(0) { }
// Constructors per opcode
void InitAlt(uint32 out, uint32 out1);
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
void InitCapture(int cap, uint32 out);
void InitEmptyWidth(EmptyOp empty, uint32 out);
void InitMatch(int id);
void InitNop(uint32 out);
void InitFail();
// Getters
int id(Prog* p) { return this - p->inst_; }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int out() { return out_opcode_>>3; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
bool greedy(Prog *p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange;
}
// Does this inst (an kInstByteRange) match c?
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase_ && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return lo_ <= c && c <= hi_;
}
// Returns string representation for debugging.
string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_, and PatchList steals another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<3) | opcode;
}
void set_out(int out) {
out_opcode_ = (out<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<3) | opcode;
}
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
union { // additional instruction arguments:
uint32 out1_; // opcode == kInstAlt
// alternate next instruction
int32 cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32 match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
struct { // opcode == kInstByteRange
uint8 lo_; // byte range is lo_-hi_ inclusive
uint8 hi_; //
uint8 foldcase_; // convert A-Z to a-z before checking range.
};
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
};
friend class Compiler;
friend struct PatchList;
friend class Prog;
DISALLOW_COPY_AND_ASSIGN(Inst);
};
// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
kAnchored, // match only starting at beginning of text
};
// Kind of match to look for (for anchor != kFullMatch)
//
// kLongestMatch mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// kFullMatch is not strictly necessary -- we could use
// kLongestMatch and then check the length of the match -- but
// the matching code can run faster if it knows to consider only
// full matches.
enum MatchKind {
kFirstMatch, // like Perl, PCRE
kLongestMatch, // like egrep or POSIX
kFullMatch, // match only entire text; implies anchor==kAnchored
kManyMatch // for SearchDFA, records set of matches
};
Inst *inst(int id) { return &inst_[id]; }
int start() { return start_; }
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
int64 size() { return size_; }
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
int64 byte_inst_count() { return byte_inst_count_; }
const Bitmap<256>& byterange() { return byterange_; }
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
int64 dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
void set_flags(int flags) { flags_ = flags; }
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8* bytemap() { return bytemap_; }
// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
// Record that at some point in the prog, the bytes in the range
// lo-hi (inclusive) are treated as different from bytes outside the range.
// Tracking this lets the DFA collapse commonly-treated byte ranges
// when recording state pointers, greatly reducing its memory footprint.
void MarkByteRange(int lo, int hi);
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32 EmptyFlags(const StringPiece& context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
// This is not right for Unicode, but:
// - it's hard to get right in a byte-at-a-time matching world
// (the DFA has only one-byte lookahead).
// - even if the lookahead were possible, the Progs would be huge.
// This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8 c) {
return ('A' <= c && c <= 'Z') ||
('a' <= c && c <= 'z') ||
('0' <= c && c <= '9') ||
c == '_';
}
// Execution engines. They all search for the regexp (run the prog)
// in text, which is in the larger context (used for ^ $ \b etc).
// Anchor and kind control the kind of search.
// Returns true if match found, false if not.
// If match found, fills match[0..nmatch-1] with submatch info.
// match[0] is overall match, match[1] is first set of parens, etc.
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
// Matching text == StringPiece(NULL, 0) is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
bool SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
// Returns whether a match was found.
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match0, bool* failed,
vector<int>* matches);
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work. This function is useful only
// for testing purposes. Returns number of states.
int BuildEntireDFA(MatchKind kind);
// Compute byte map.
void ComputeByteMap();
// Run peep-hole optimizer on program.
void Optimize();
// One-pass NFA: only correct if IsOnePass() is true,
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the program size and the text size.
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
// Backtracking search: the gold standard against which the other
// implementations are checked. FOR TESTING ONLY.
// It allocates a ton of memory to avoid running forever.
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
bool UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int byte_inst_count_; // number of kInstByteRange instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int flags_; // regexp parse flags
int onepass_statesize_; // byte size of each OneState* node
Inst* inst_; // pointer to instruction array
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
void (*delete_dfa_)(DFA* dfa);
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
// commonly-treated byte range.
uint8 bytemap_[256]; // map from input bytes to byte classes
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
uint8* onepass_nodes_; // data for OnePass nodes
OneState* onepass_start_; // start node for OnePass program
DISALLOW_COPY_AND_ASSIGN(Prog);
};
} // namespace re2
#endif // RE2_PROG_H__

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,937 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression representation.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
namespace re2 {
// Constructor. Allocates vectors as appropriate for operator.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
: op_(op),
simple_(false),
parse_flags_(static_cast<uint16>(parse_flags)),
ref_(1),
nsub_(0),
down_(NULL) {
subone_ = NULL;
memset(the_union_, 0, sizeof the_union_);
}
// Destructor. Assumes already cleaned up children.
// Private: use Decref() instead of delete to destroy Regexps.
// Can't call Decref on the sub-Regexps here because
// that could cause arbitrarily deep recursion, so
// required Decref() to have handled them for us.
Regexp::~Regexp() {
if (nsub_ > 0)
LOG(DFATAL) << "Regexp not destroyed.";
switch (op_) {
default:
break;
case kRegexpCapture:
delete name_;
break;
case kRegexpLiteralString:
delete[] runes_;
break;
case kRegexpCharClass:
if (cc_)
cc_->Delete();
delete ccb_;
break;
}
}
// If it's possible to destroy this regexp without recurring,
// do so and return true. Else return false.
bool Regexp::QuickDestroy() {
if (nsub_ == 0) {
delete this;
return true;
}
return false;
}
static map<Regexp*, int> *ref_map;
GLOBAL_MUTEX(ref_mutex);
int Regexp::Ref() {
if (ref_ < kMaxRef)
return ref_;
GLOBAL_MUTEX_LOCK(ref_mutex);
int r = 0;
if (ref_map != NULL) {
r = (*ref_map)[this];
}
GLOBAL_MUTEX_UNLOCK(ref_mutex);
return r;
}
// Increments reference count, returns object as convenience.
Regexp* Regexp::Incref() {
if (ref_ >= kMaxRef-1) {
// Store ref count in overflow map.
GLOBAL_MUTEX_LOCK(ref_mutex);
if (ref_map == NULL) {
ref_map = new map<Regexp*, int>;
}
if (ref_ == kMaxRef) {
// already overflowed
(*ref_map)[this]++;
} else {
// overflowing now
(*ref_map)[this] = kMaxRef;
ref_ = kMaxRef;
}
GLOBAL_MUTEX_UNLOCK(ref_mutex);
return this;
}
ref_++;
return this;
}
// Decrements reference count and deletes this object if count reaches 0.
void Regexp::Decref() {
if (ref_ == kMaxRef) {
// Ref count is stored in overflow map.
GLOBAL_MUTEX_LOCK(ref_mutex);
int r = (*ref_map)[this] - 1;
if (r < kMaxRef) {
ref_ = r;
ref_map->erase(this);
} else {
(*ref_map)[this] = r;
}
GLOBAL_MUTEX_UNLOCK(ref_mutex);
return;
}
ref_--;
if (ref_ == 0)
Destroy();
}
// Deletes this object; ref count has count reached 0.
void Regexp::Destroy() {
if (QuickDestroy())
return;
// Handle recursive Destroy with explicit stack
// to avoid arbitrarily deep recursion on process stack [sigh].
down_ = NULL;
Regexp* stack = this;
while (stack != NULL) {
Regexp* re = stack;
stack = re->down_;
if (re->ref_ != 0)
LOG(DFATAL) << "Bad reference count " << re->ref_;
if (re->nsub_ > 0) {
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
if (sub == NULL)
continue;
if (sub->ref_ == kMaxRef)
sub->Decref();
else
--sub->ref_;
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
sub->down_ = stack;
stack = sub;
}
}
if (re->nsub_ > 1)
delete[] subs;
re->nsub_ = 0;
}
delete re;
}
}
void Regexp::AddRuneToString(Rune r) {
DCHECK(op_ == kRegexpLiteralString);
if (nrunes_ == 0) {
// start with 8
runes_ = new Rune[8];
} else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
// double on powers of two
Rune *old = runes_;
runes_ = new Rune[nrunes_ * 2];
for (int i = 0; i < nrunes_; i++)
runes_[i] = old[i];
delete[] old;
}
runes_[nrunes_++] = r;
}
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpHaveMatch, flags);
re->match_id_ = match_id;
return re;
}
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpPlus, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpStar, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpQuest, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
ParseFlags flags, bool can_factor) {
if (nsub == 1)
return sub[0];
if (nsub == 0) {
if (op == kRegexpAlternate)
return new Regexp(kRegexpNoMatch, flags);
else
return new Regexp(kRegexpEmptyMatch, flags);
}
Regexp** subcopy = NULL;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
subcopy = new Regexp*[nsub];
memmove(subcopy, sub, nsub * sizeof sub[0]);
sub = subcopy;
nsub = FactorAlternation(sub, nsub, flags);
if (nsub == 1) {
Regexp* re = sub[0];
delete[] subcopy;
return re;
}
}
if (nsub > kMaxNsub) {
// Too many subexpressions to fit in a single Regexp.
// Make a two-level tree. Two levels gets us to 65535^2.
int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
Regexp* re = new Regexp(op, flags);
re->AllocSub(nbigsub);
Regexp** subs = re->sub();
for (int i = 0; i < nbigsub - 1; i++)
subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
nsub - (nbigsub-1)*kMaxNsub, flags,
false);
delete[] subcopy;
return re;
}
Regexp* re = new Regexp(op, flags);
re->AllocSub(nsub);
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
delete[] subcopy;
return re;
}
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
}
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
}
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
}
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
Regexp* re = new Regexp(kRegexpCapture, flags);
re->AllocSub(1);
re->sub()[0] = sub;
re->cap_ = cap;
return re;
}
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
Regexp* re = new Regexp(kRegexpRepeat, flags);
re->AllocSub(1);
re->sub()[0] = sub;
re->min_ = min;
re->max_ = max;
return re;
}
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpLiteral, flags);
re->rune_ = rune;
return re;
}
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
if (nrunes <= 0)
return new Regexp(kRegexpEmptyMatch, flags);
if (nrunes == 1)
return NewLiteral(runes[0], flags);
Regexp* re = new Regexp(kRegexpLiteralString, flags);
for (int i = 0; i < nrunes; i++)
re->AddRuneToString(runes[i]);
return re;
}
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpCharClass, flags);
re->cc_ = cc;
return re;
}
// Swaps this and that in place.
void Regexp::Swap(Regexp* that) {
// Can use memmove because Regexp is just a struct (no vtable).
char tmp[sizeof *this];
memmove(tmp, this, sizeof tmp);
memmove(this, that, sizeof tmp);
memmove(that, tmp, sizeof tmp);
}
// Tests equality of all top-level structure but not subregexps.
static bool TopEqual(Regexp* a, Regexp* b) {
if (a->op() != b->op())
return false;
switch (a->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpBeginText:
return true;
case kRegexpEndText:
// The parse flags remember whether it's \z or (?-m:$),
// which matters when testing against PCRE.
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
case kRegexpLiteral:
return a->rune() == b->rune() &&
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
case kRegexpLiteralString:
return a->nrunes() == b->nrunes() &&
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
memcmp(a->runes(), b->runes(),
a->nrunes() * sizeof a->runes()[0]) == 0;
case kRegexpAlternate:
case kRegexpConcat:
return a->nsub() == b->nsub();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
case kRegexpRepeat:
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
a->min() == b->min() &&
a->max() == b->max();
case kRegexpCapture:
return a->cap() == b->cap() && a->name() == b->name();
case kRegexpHaveMatch:
return a->match_id() == b->match_id();
case kRegexpCharClass: {
CharClass* acc = a->cc();
CharClass* bcc = b->cc();
return acc->size() == bcc->size() &&
acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
memcmp(acc->begin(), bcc->begin(),
(acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
}
}
LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
return 0;
}
bool Regexp::Equal(Regexp* a, Regexp* b) {
if (a == NULL || b == NULL)
return a == b;
if (!TopEqual(a, b))
return false;
// Fast path:
// return without allocating vector if there are no subregexps.
switch (a->op()) {
case kRegexpAlternate:
case kRegexpConcat:
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
break;
default:
return true;
}
// Committed to doing real work.
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
Regexp* a2;
Regexp* b2;
switch (a->op()) {
default:
break;
case kRegexpAlternate:
case kRegexpConcat:
for (int i = 0; i < a->nsub(); i++) {
a2 = a->sub()[i];
b2 = b->sub()[i];
if (!TopEqual(a2, b2))
return false;
stk.push_back(a2);
stk.push_back(b2);
}
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
a2 = a->sub()[0];
b2 = b->sub()[0];
if (!TopEqual(a2, b2))
return false;
// Really:
// stk.push_back(a2);
// stk.push_back(b2);
// break;
// but faster to assign directly and loop.
a = a2;
b = b2;
continue;
}
int n = stk.size();
if (n == 0)
break;
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
}
return true;
}
// Keep in sync with enum RegexpStatusCode in regexp.h
static const char *kErrorStrings[] = {
"no error",
"unexpected error",
"invalid escape sequence",
"invalid character class",
"invalid character class range",
"missing ]",
"missing )",
"trailing \\",
"no argument for repetition operator",
"invalid repetition size",
"bad repetition operator",
"invalid perl operator",
"invalid UTF-8",
"invalid named capture group",
};
string RegexpStatus::CodeText(enum RegexpStatusCode code) {
if (code < 0 || code >= arraysize(kErrorStrings))
code = kRegexpInternalError;
return kErrorStrings[code];
}
string RegexpStatus::Text() const {
if (error_arg_.empty())
return CodeText(code_);
string s;
s.append(CodeText(code_));
s.append(": ");
s.append(error_arg_.data(), error_arg_.size());
return s;
}
void RegexpStatus::Copy(const RegexpStatus& status) {
code_ = status.code_;
error_arg_ = status.error_arg_;
}
typedef int Ignored; // Walker<void> doesn't exist
// Walker subclass to count capturing parens in regexp.
class NumCapturesWalker : public Regexp::Walker<Ignored> {
public:
NumCapturesWalker() : ncapture_(0) {}
int ncapture() { return ncapture_; }
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture)
ncapture_++;
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
return ignored;
}
private:
int ncapture_;
DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker);
};
int Regexp::NumCaptures() {
NumCapturesWalker w;
w.Walk(this, 0);
return w.ncapture();
}
// Walker class to build map of named capture groups and their indices.
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
public:
NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker() { delete map_; }
map<string, int>* TakeMap() {
map<string, int>* m = map_;
map_ = NULL;
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new map<string, int>;
// Record first occurrence of each name.
// (The rule is that if you have the same name
// multiple times, only the leftmost one counts.)
if (map_->find(*re->name()) == map_->end())
(*map_)[*re->name()] = re->cap();
}
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
return ignored;
}
private:
map<string, int>* map_;
DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker);
};
map<string, int>* Regexp::NamedCaptures() {
NamedCapturesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Walker class to build map from capture group indices to their names.
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
public:
CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker() { delete map_; }
map<int, string>* TakeMap() {
map<int, string>* m = map_;
map_ = NULL;
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new map<int, string>;
(*map_)[re->cap()] = *re->name();
}
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
return ignored;
}
private:
map<int, string>* map_;
DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker);
};
map<int, string>* Regexp::CaptureNames() {
CaptureNamesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
prefix->clear();
*foldcase = false;
*suffix = NULL;
if (op_ != kRegexpConcat)
return false;
// Some number of anchors, then a literal or concatenation.
int i = 0;
Regexp** sub = this->sub();
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
i++;
if (i == 0 || i >= nsub_)
return false;
Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = re->runes_[j];
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
prefix->resize(re->nrunes_ * UTFmax);
char *p = &(*prefix)[0];
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
if (r < Runeself)
*p++ = r;
else
p += runetochar(p, &r);
}
prefix->resize(p - &(*prefix)[0]);
}
break;
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, re->rune_);
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase);
i++;
// The rest.
if (i < nsub_) {
for (int j = i; j < nsub_; j++)
sub[j]->Incref();
re = Concat(sub + i, nsub_ - i, parse_flags());
} else {
re = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
return true;
}
// Character class builder is a balanced binary tree (STL set)
// containing non-overlapping, non-abutting RuneRanges.
// The less-than operator used in the tree treats two
// ranges as equal if they overlap at all, so that
// lookups for a particular Rune are possible.
CharClassBuilder::CharClassBuilder() {
nrunes_ = 0;
upper_ = 0;
lower_ = 0;
}
// Add lo-hi to the class; return whether class got bigger.
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (hi < lo)
return false;
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
Rune lo1 = max<Rune>(lo, 'A');
Rune hi1 = min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
lo1 = max<Rune>(lo, 'a');
hi1 = min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
{ // Check whether lo, hi is already in the class.
iterator it = ranges_.find(RuneRange(lo, lo));
if (it != end() && it->lo <= lo && hi <= it->hi)
return false;
}
// Look for a range abutting lo on the left.
// If it exists, take it out and increase our range.
if (lo > 0) {
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
if (it != end()) {
lo = it->lo;
if (it->hi > hi)
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for a range abutting hi on the right.
// If it exists, take it out and increase our range.
if (hi < Runemax) {
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
if (it != end()) {
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for ranges between lo and hi. Take them out.
// This is only safe because the set has no overlapping ranges.
// We've already removed any ranges abutting lo and hi, so
// any that overlap [lo, hi] must be contained within it.
for (;;) {
iterator it = ranges_.find(RuneRange(lo, hi));
if (it == end())
break;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
// Finally, add [lo, hi].
nrunes_ += hi - lo + 1;
ranges_.insert(RuneRange(lo, hi));
return true;
}
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
for (iterator it = cc->begin(); it != cc->end(); ++it)
AddRange(it->lo, it->hi);
}
bool CharClassBuilder::Contains(Rune r) {
return ranges_.find(RuneRange(r, r)) != end();
}
// Does the character class behave the same on A-Z as on a-z?
bool CharClassBuilder::FoldsASCII() {
return ((upper_ ^ lower_) & AlphaMask) == 0;
}
CharClassBuilder* CharClassBuilder::Copy() {
CharClassBuilder* cc = new CharClassBuilder;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_.insert(RuneRange(it->lo, it->hi));
cc->upper_ = upper_;
cc->lower_ = lower_;
cc->nrunes_ = nrunes_;
return cc;
}
void CharClassBuilder::RemoveAbove(Rune r) {
if (r >= Runemax)
return;
if (r < 'z') {
if (r < 'a')
lower_ = 0;
else
lower_ &= AlphaMask >> ('z' - r);
}
if (r < 'Z') {
if (r < 'A')
upper_ = 0;
else
upper_ &= AlphaMask >> ('Z' - r);
}
for (;;) {
iterator it = ranges_.find(RuneRange(r + 1, Runemax));
if (it == end())
break;
RuneRange rr = *it;
ranges_.erase(it);
nrunes_ -= rr.hi - rr.lo + 1;
if (rr.lo <= r) {
rr.hi = r;
ranges_.insert(rr);
nrunes_ += rr.hi - rr.lo + 1;
}
}
}
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
// the current class begins at 0.
iterator it = begin();
if (it == end()) {
v.push_back(RuneRange(0, Runemax));
} else {
int nextlo = 0;
if (it->lo == 0) {
nextlo = it->hi + 1;
++it;
}
for (; it != end(); ++it) {
v.push_back(RuneRange(nextlo, it->lo - 1));
nextlo = it->hi + 1;
}
if (nextlo <= Runemax)
v.push_back(RuneRange(nextlo, Runemax));
}
ranges_.clear();
for (size_t i = 0; i < v.size(); i++)
ranges_.insert(v[i]);
upper_ = AlphaMask & ~upper_;
lower_ = AlphaMask & ~lower_;
nrunes_ = Runemax+1 - nrunes_;
}
// Character class is a sorted list of ranges.
// The ranges are allocated in the same block as the header,
// necessitating a special allocator and Delete method.
CharClass* CharClass::New(int maxranges) {
CharClass* cc;
uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
cc = reinterpret_cast<CharClass*>(data);
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
cc->nranges_ = 0;
cc->folds_ascii_ = false;
cc->nrunes_ = 0;
return cc;
}
void CharClass::Delete() {
uint8 *data = reinterpret_cast<uint8*>(this);
delete[] data;
}
CharClass* CharClass::Negate() {
CharClass* cc = CharClass::New(nranges_+1);
cc->folds_ascii_ = folds_ascii_;
cc->nrunes_ = Runemax + 1 - nrunes_;
int n = 0;
int nextlo = 0;
for (CharClass::iterator it = begin(); it != end(); ++it) {
if (it->lo == nextlo) {
nextlo = it->hi + 1;
} else {
cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
nextlo = it->hi + 1;
}
}
if (nextlo <= Runemax)
cc->ranges_[n++] = RuneRange(nextlo, Runemax);
cc->nranges_ = n;
return cc;
}
bool CharClass::Contains(Rune r) {
RuneRange* rr = ranges_;
int n = nranges_;
while (n > 0) {
int m = n/2;
if (rr[m].hi < r) {
rr += m+1;
n -= m+1;
} else if (r < rr[m].lo) {
n = m;
} else { // rr[m].lo <= r && r <= rr[m].hi
return true;
}
}
return false;
}
CharClass* CharClassBuilder::GetCharClass() {
CharClass* cc = CharClass::New(ranges_.size());
size_t n = 0;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_[n++] = *it;
cc->nranges_ = n;
DCHECK_LE(static_cast<size_t>(n), ranges_.size());
cc->nrunes_ = nrunes_;
cc->folds_ascii_ = FoldsASCII();
return cc;
}
} // namespace re2

View File

@@ -0,0 +1,633 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------
// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps. RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
// 1. Backreferences (optional in POSIX EREs) are not supported.
// (Supporting them precludes the use of DFA-based
// matching engines.)
//
// 2. Collating elements and collation classes are not supported.
// (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse. In particular, many of the basic Perl additions
// are available. The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression. These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions is guaranteed
// to be that used by Regexp's parser, not the syntax expected
// by the sublibrary. Regexp might accept a restricted or
// expanded syntax for regular expressions as compared with
// the sublibrary. As long as Regexp can translate from its
// internal form into the sublibrary's, clients need not know
// exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed. For whatever reason,
// sublibrary regular expression parsers often have security
// problems. For example, plan9grep's regular expression parser
// has a buffer overflow in its handling of large character
// classes, and PCRE's parser has had buffer overflow problems
// in the past. Security-team requires sandboxing of sublibrary
// regular expression parsers. Avoiding the sublibrary parsers
// avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#ifndef RE2_REGEXP_H__
#define RE2_REGEXP_H__
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
// Matches no strings.
kRegexpNoMatch = 1,
// Matches empty string.
kRegexpEmptyMatch,
// Matches rune_.
kRegexpLiteral,
// Matches runes_.
kRegexpLiteralString,
// Matches concatenation of sub_[0..nsub-1].
kRegexpConcat,
// Matches union of sub_[0..nsub-1].
kRegexpAlternate,
// Matches sub_[0] zero or more times.
kRegexpStar,
// Matches sub_[0] one or more times.
kRegexpPlus,
// Matches sub_[0] zero or one times.
kRegexpQuest,
// Matches sub_[0] at least min_ times, at most max_ times.
// max_ == -1 means no upper limit.
kRegexpRepeat,
// Parenthesized (capturing) subexpression. Index is cap_.
// Optionally, capturing name is name_.
kRegexpCapture,
// Matches any character.
kRegexpAnyChar,
// Matches any byte [sic].
kRegexpAnyByte,
// Matches empty string at beginning of line.
kRegexpBeginLine,
// Matches empty string at end of line.
kRegexpEndLine,
// Matches word boundary "\b".
kRegexpWordBoundary,
// Matches not-a-word boundary "\B".
kRegexpNoWordBoundary,
// Matches empty string at beginning of text.
kRegexpBeginText,
// Matches empty string at end of text.
kRegexpEndText,
// Matches character class given by cc_.
kRegexpCharClass,
// Forces match of entire expression right now,
// with match ID match_id_ (used by RE2::Set).
kRegexpHaveMatch,
kMaxRegexpOp = kRegexpHaveMatch,
};
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
// No error
kRegexpSuccess = 0,
// Unexpected error
kRegexpInternalError,
// Parse errors
kRegexpBadEscape, // bad escape sequence
kRegexpBadCharClass, // bad character class
kRegexpBadCharRange, // bad character class range
kRegexpMissingBracket, // missing closing ]
kRegexpMissingParen, // missing closing )
kRegexpTrailingBackslash, // at end of regexp
kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
kRegexpRepeatSize, // bad repetition argument
kRegexpRepeatOp, // bad repetition operator
kRegexpBadPerlOp, // bad perl operator
kRegexpBadUTF8, // invalid UTF-8 in regexp
kRegexpBadNamedCapture, // bad named capture
};
// Error status for certain operations.
class RegexpStatus {
public:
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus() { delete tmp_; }
void set_code(enum RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
enum RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
void Copy(const RegexpStatus& status);
// Returns text equivalent of code, e.g.:
// "Bad character class"
static string CodeText(enum RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
string Text() const;
private:
enum RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
string* tmp_; // Temporary storage, possibly where error_arg_ is.
DISALLOW_COPY_AND_ASSIGN(RegexpStatus);
};
// Walker to implement Simplify.
class SimplifyWalker;
// Compiled form; see prog.h
class Prog;
struct RuneRange {
RuneRange() : lo(0), hi(0) { }
RuneRange(int l, int h) : lo(l), hi(h) { }
Rune lo;
Rune hi;
};
// Less-than on RuneRanges treats a == b if they overlap at all.
// This lets us look in a set to find the range covering a particular Rune.
struct RuneRangeLess {
bool operator()(const RuneRange& a, const RuneRange& b) const {
return a.hi < b.lo;
}
};
class CharClassBuilder;
class CharClass {
public:
void Delete();
typedef RuneRange* iterator;
iterator begin() { return ranges_; }
iterator end() { return ranges_ + nranges_; }
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
bool full() { return nrunes_ == Runemax+1; }
bool FoldsASCII() { return folds_ascii_; }
bool Contains(Rune r);
CharClass* Negate();
private:
CharClass(); // not implemented
~CharClass(); // not implemented
static CharClass* New(int maxranges);
friend class CharClassBuilder;
bool folds_ascii_;
int nrunes_;
RuneRange *ranges_;
int nranges_;
DISALLOW_COPY_AND_ASSIGN(CharClass);
};
class Regexp {
public:
// Flags for parsing. Can be ORed together.
enum ParseFlags {
NoParseFlags = 0,
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
Literal = 1<<1, // Treat s as literal string instead of a regexp.
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
// and [[:space:]] to match newline.
DotNL = 1<<3, // Allow . to match newline.
MatchNL = ClassNL | DotNL,
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
// end of text, not around embedded newlines.
// (Perl's default)
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
PerlClasses = 1<<7, // Allow Perl character classes like \d.
PerlB = 1<<8, // Allow Perl's \b and \B.
PerlX = 1<<9, // Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C to match any single byte
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
// and \P{Han} for its negation.
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
NeverCapture = 1<<12, // Parse all parens as non-capturing.
// As close to Perl as we can get.
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
UnicodeGroups,
// Internal use only.
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
};
// Get. No set, Regexps are logically immutable once created.
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_; }
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
Regexp** sub() {
if(nsub_ <= 1)
return &subone_;
else
return submany_;
}
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
// Increments reference count, returns object as convenience.
Regexp* Incref();
// Decrements reference count and deletes this object if count reaches 0.
void Decref();
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
// On failure, sets *status (if status != NULL) and returns NULL.
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
RegexpStatus* status);
// Returns a _new_ simplified version of the current regexp.
// Does not edit the current regexp.
// Caller must release return value with re->Decref().
// Simplified means that counted repetition has been rewritten
// into simpler terms and all Perl/POSIX features have been
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *status (if status != NULL) on parse error.
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status);
// Returns the number of capturing groups in the regexp.
int NumCaptures();
friend class NumCapturesWalker;
// Returns a map from names to capturing group indices,
// or NULL if the regexp contains no named capture groups.
// The caller is responsible for deleting the map.
map<string, int>* NamedCaptures();
// Returns a map from capturing group indices to capturing group
// names or NULL if the regexp contains no named capture groups. The
// caller is responsible for deleting the map.
map<int, string>* CaptureNames();
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
string ToString();
// Convenience functions. They consume the passed reference,
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
// They do not consume allocated arrays like subs or runes.
static Regexp* Plus(Regexp* sub, ParseFlags flags);
static Regexp* Star(Regexp* sub, ParseFlags flags);
static Regexp* Quest(Regexp* sub, ParseFlags flags);
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
static Regexp* HaveMatch(int match_id, ParseFlags flags);
// Like Alternate but does not factor out common prefixes.
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
// Debugging function. Returns string format for regexp
// that makes structure clear. Does NOT use regexp syntax.
string Dump();
// Helper traversal class, defined fully in walker-inl.h.
template<typename T> class Walker;
// Compile to Prog. See prog.h
// Reverse prog expects to be run over text backward.
// Construction and execution of prog will
// stay within approximately max_mem bytes of memory.
// If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64 max_mem);
Prog* CompileToReverseProg(int64 max_mem);
// Whether to expect this library to find exactly the same answer as PCRE
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
// obscure cases behave differently. Technically this is more a property
// of the Prog than the Regexp, but the computation is much easier to do
// on the Regexp. See mimics_pcre.cc for the exact conditions.
bool MimicsPCRE();
// Benchmarking function.
void NullWalk();
// Whether every match of this regexp must be anchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix and the sub-regexp that
// follows it.
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
// Use Decref() instead of delete to release Regexps.
// This is private to catch deletes at compile time.
~Regexp();
void Destroy();
bool QuickDestroy();
// Helpers for Parse. Listed here so they can edit Regexps.
class ParseState;
friend class ParseState;
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
// Helper for testing [sic].
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
ParseFlags flags, bool can_factor);
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
// Removes the first n leading runes from the beginning of re.
// Edits re in place.
static void RemoveLeadingString(Regexp* re, int n);
// Returns the leading regexp in re's top-level concatenation.
// The returned Regexp* points at re or a sub-expression of re,
// so it must not be used after the caller calls re->Decref().
static Regexp* LeadingRegexp(Regexp* re);
// Removes LeadingRegexp(re) from re and returns the remainder.
// Might edit re in place.
static Regexp* RemoveLeadingRegexp(Regexp* re);
// Simplifies an alternation of literal strings by factoring out
// common prefixes.
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
static int FactorAlternationRecursive(Regexp** sub, int nsub,
ParseFlags flags, int maxdepth);
// Is a == b? Only efficient on regexps that have not been through
// Simplify yet - the expansion of a kRegexpRepeat will make this
// take a long time. Do not call on such regexps, hence private.
static bool Equal(Regexp* a, Regexp* b);
// Allocate space for n sub-regexps.
void AllocSub(int n) {
if (n < 0 || static_cast<uint16>(n) != n)
LOG(FATAL) << "Cannot AllocSub " << n;
if (n > 1)
submany_ = new Regexp*[n];
nsub_ = n;
}
// Add Rune to LiteralString
void AddRuneToString(Rune r);
// Swaps this with that, in place.
void Swap(Regexp *that);
// Operator. See description of operators above.
// uint8 instead of RegexpOp to control space usage.
uint8 op_;
// Is this regexp structure already simple
// (has it been returned by Simplify)?
// uint8 instead of bool to control space usage.
uint8 simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
// uint16 instead of ParseFlags to control space usage.
uint16 parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
// uint16 to control space usage.
// The standard regexp routines will never generate a
// ref greater than the maximum repeat count (100),
// but even so, Incref and Decref consult an overflow map
// when ref_ reaches kMaxRef.
uint16 ref_;
static const uint16 kMaxRef = 0xffff;
// Subexpressions.
// uint16 to control space usage.
// Concat and Alternate handle larger numbers of subexpressions
// by building concatenation or alternation trees.
// Other routines should call Concat or Alternate instead of
// filling in sub() by hand.
uint16 nsub_;
static const uint16 kMaxNsub = 0xffff;
union {
Regexp** submany_; // if nsub_ > 1
Regexp* subone_; // if nsub_ == 1
};
// Extra space for parse and teardown stacks.
Regexp* down_;
// Arguments to operator. See description of operators above.
union {
struct { // Repeat
int max_;
int min_;
};
struct { // Capture
int cap_;
string* name_;
};
struct { // LiteralString
int nrunes_;
Rune* runes_;
};
struct { // CharClass
// These two could be in separate union members,
// but it wouldn't save any space (there are other two-word structs)
// and keeping them separate avoids confusion during parsing.
CharClass* cc_;
CharClassBuilder* ccb_;
};
Rune rune_; // Literal
int match_id_; // HaveMatch
void *the_union_[2]; // as big as any other element, for memset
};
DISALLOW_COPY_AND_ASSIGN(Regexp);
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
class CharClassBuilder {
public:
CharClassBuilder();
typedef RuneRangeSet::iterator iterator;
iterator begin() { return ranges_.begin(); }
iterator end() { return ranges_.end(); }
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
bool full() { return nrunes_ == Runemax+1; }
bool Contains(Rune r);
bool FoldsASCII();
bool AddRange(Rune lo, Rune hi); // returns whether class changed
CharClassBuilder* Copy();
void AddCharClass(CharClassBuilder* cc);
void Negate();
void RemoveAbove(Rune r);
CharClass* GetCharClass();
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
private:
static const uint32 AlphaMask = (1<<26) - 1;
uint32 upper_; // bitmap of A-Z
uint32 lower_; // bitmap of a-z
int nrunes_;
RuneRangeSet ranges_;
DISALLOW_COPY_AND_ASSIGN(CharClassBuilder);
};
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
}
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
}
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
}
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
{
return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
}
} // namespace re2
#endif // RE2_REGEXP_H__

View File

@@ -0,0 +1,113 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/set.h"
#include "util/util.h"
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2.h"
#include "re2/regexp.h"
using namespace re2;
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
options_.Copy(options);
anchor_ = anchor;
prog_ = NULL;
compiled_ = false;
}
RE2::Set::~Set() {
for (size_t i = 0; i < re_.size(); i++)
re_[i]->Decref();
delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add after Compile";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
if (options_.log_errors())
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
return -1;
}
// Concatenate with match index and push on vector.
int n = re_.size();
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
for (int i = 0; i < nsub; i++)
sub[i] = re->sub()[i]->Incref();
sub[nsub] = m;
re->Decref();
re = re2::Regexp::Concat(sub, nsub + 1, pf);
delete[] sub;
} else {
re2::Regexp* sub[2];
sub[0] = re;
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
re_.push_back(re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile multiple times";
return false;
}
compiled_ = true;
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
re_.size(), pf);
re_.clear();
re2::Regexp* sre = re->Simplify();
re->Decref();
re = sre;
if (re == NULL) {
if (options_.log_errors())
LOG(ERROR) << "Error simplifying during Compile.";
return false;
}
prog_ = Prog::CompileSet(options_, anchor_, re);
return prog_ != NULL;
}
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
if (!compiled_) {
LOG(DFATAL) << "RE2::Set::Match without Compile";
return false;
}
v->clear();
bool failed;
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
Prog::kManyMatch, NULL, &failed, v);
if (failed)
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
if (ret == false)
return false;
if (v->size() == 0) {
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
return false;
}
return true;
}

View File

@@ -0,0 +1,55 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H
#define RE2_SET_H
#include <utility>
#include <vector>
#include "re2.h"
namespace re2 {
using std::vector;
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
class RE2::Set {
public:
Set(const RE2::Options& options, RE2::Anchor anchor);
~Set();
// Add adds regexp pattern to the set, interpreted using the RE2 options.
// (The RE2 constructor's default options parameter is RE2::UTF8.)
// Add returns the regexp index that will be used to identify
// it in the result of Match, or -1 if the regexp cannot be parsed.
// Indices are assigned in sequential order starting from 0.
// Error returns do not increment the index.
// If an error occurs and error != NULL, *error will hold an error message.
int Add(const StringPiece& pattern, string* error);
// Compile prepares the Set for matching.
// Add must not be called again after Compile.
// Compile must be called before FullMatch or PartialMatch.
// Compile may return false if it runs out of memory.
bool Compile();
// Match returns true if text matches any of the regexps in the set.
// If so, it fills v with the indices of the matching regexps.
bool Match(const StringPiece& text, vector<int>* v) const;
private:
RE2::Options options_;
RE2::Anchor anchor_;
vector<re2::Regexp*> re_;
re2::Prog* prog_;
bool compiled_;
//DISALLOW_COPY_AND_ASSIGN(Set);
Set(const Set&);
void operator=(const Set&);
};
} // namespace re2
#endif // RE2_SET_H

View File

@@ -0,0 +1,392 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rewrite POSIX and other features in re
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
// Should not happen, since Simplify never fails.
LOG(ERROR) << "Simplify failed on " << src;
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
bool Regexp::ComputeSimple() {
Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple_)
return false;
return true;
case kRegexpCharClass:
// Simple as long as the char class is not empty, not full.
if (ccb_ != NULL)
return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple_;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple_)
return false;
switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpEmptyMatch:
case kRegexpNoMatch:
return false;
default:
break;
}
return true;
case kRegexpRepeat:
return false;
}
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
return false;
}
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
virtual Regexp* PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside SimplifyWalker so that
// they can edit the private fields of the Regexps they construct.
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Caller must Decref return value when done with it.
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags parse_flags);
// Simplifies a character class by expanding any named classes
// into rune ranges. Does not edit re. Does not consume ref to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyCharClass(Regexp* re);
DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);
};
// Simplifies a regular expression, returning a new regexp.
// The new regexp uses traditional Unix egrep features only,
// plus the Perl (?:) non-capturing parentheses.
// Otherwise, no POSIX or Perl additions. The new regexp
// captures exactly the same subexpressions (with the same indices)
// as the original.
// Does not edit current object.
// Caller must Decref() return value when done with it.
Regexp* Regexp::Simplify() {
if (simple_)
return Incref();
SimplifyWalker w;
return w.Walk(this, NULL);
}
#define Simplify DontCallSimplify // Avoid accidental recursion
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// This should never be called, since we use Walk and not
// WalkExponential.
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
return re->Incref();
}
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
if (re->simple_) {
*stop = true;
return re->Incref();
}
return NULL;
}
Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
// Two passes to avoid allocation in the common case.
bool changed = false;
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
Regexp* newsub = child_args[i];
if (newsub != sub) {
changed = true;
break;
}
}
if (!changed) {
for (int i = 0; i < re->nsub_; i++) {
Regexp* newsub = child_args[i];
newsub->Decref();
}
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub_);
Regexp** nre_subs = nre->sub();
for (int i = 0; i <re->nsub_; i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap_;
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// These are also idempotent if flags are constant.
if (re->op() == newsub->op() &&
re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Returns a new Regexp, handing the ref to the caller.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
Regexp::ParseFlags parse_flags) {
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
re->AllocSub(2);
Regexp** subs = re->sub();
subs[0] = re1;
subs[1] = re2;
return re;
}
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
// The result will *not* necessarily have the right capturing parens
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags f) {
// x{n,} means at least n matches of x.
if (max == -1) {
// Special case: x{0,} is x*
if (min == 0)
return Regexp::Star(re->Incref(), f);
// Special case: x{1,} is x+
if (min == 1)
return Regexp::Plus(re->Incref(), f);
// General case: x{4,} is xxxx+
Regexp* nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
Regexp** nre_subs = nre->sub();
for (int i = 0; i < min-1; i++)
nre_subs[i] = re->Incref();
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
return nre;
}
// Special case: (x){0} matches only empty string.
if (min == 0 && max == 0)
return new Regexp(kRegexpEmptyMatch, f);
// Special case: x{1} is just x.
if (min == 1 && max == 1)
return re->Incref();
// General case: x{n,m} means n copies of x and m copies of x?.
// The machine will do less work if we nest the final m copies,
// so that x{2,5} = xx(x(x(x)?)?)?
// Build leading prefix: xx. Capturing only on the last one.
Regexp* nre = NULL;
if (min > 0) {
nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
Regexp** nre_subs = nre->sub();
for (int i = 0; i < min; i++)
nre_subs[i] = re->Incref();
}
// Build and attach suffix: (x(x(x)?)?)?
if (max > min) {
Regexp* suf = Regexp::Quest(re->Incref(), f);
for (int i = min+1; i < max; i++)
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
if (nre == NULL)
nre = suf;
else
nre = Concat2(nre, suf, f);
}
if (nre == NULL) {
// Some degenerate case, like min > max, or min < max < 0.
// This shouldn't happen, because the parser rejects such regexps.
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
return new Regexp(kRegexpNoMatch, f);
}
return nre;
}
// Simplifies a character class.
// Caller must Decref return value when done with it.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
CharClass* cc = re->cc();
// Special cases
if (cc->empty())
return new Regexp(kRegexpNoMatch, re->parse_flags());
if (cc->full())
return new Regexp(kRegexpAnyChar, re->parse_flags());
return re->Incref();
}
} // namespace re2

View File

@@ -0,0 +1,91 @@
// Copyright 2004 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/stringpiece.h"
#include "util/util.h"
using re2::StringPiece;
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
o.write(piece.data(), piece.size());
return o;
}
bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
int len = x.size();
if (len != y.size()) {
return false;
}
const char* p = x.data();
const char* p2 = y.data();
// Test last byte in case strings share large common prefix
if ((len > 0) && (p[len-1] != p2[len-1])) return false;
const char* p_limit = p + len;
for (; p < p_limit; p++, p2++) {
if (*p != *p2)
return false;
}
return true;
}
void StringPiece::CopyToString(string* target) const {
target->assign(ptr_, length_);
}
int StringPiece::copy(char* buf, size_type n, size_type pos) const {
int ret = min(length_ - pos, n);
memcpy(buf, ptr_ + pos, ret);
return ret;
}
bool StringPiece::contains(StringPiece s) const {
return (size_t)find(s, 0) != npos;
}
int StringPiece::find(const StringPiece& s, size_type pos) const {
if (length_ < 0 || pos > static_cast<size_type>(length_))
return npos;
const char* result = std::search(ptr_ + pos, ptr_ + length_,
s.ptr_, s.ptr_ + s.length_);
const size_type xpos = result - ptr_;
return xpos + s.length_ <= static_cast<size_type>(length_) ? xpos : npos;
}
int StringPiece::find(char c, size_type pos) const {
if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
return npos;
}
const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
return result != ptr_ + length_ ? result - ptr_ : npos;
}
int StringPiece::rfind(const StringPiece& s, size_type pos) const {
if (length_ < s.length_) return npos;
const size_t ulen = length_;
if (s.length_ == 0) return min(ulen, pos);
const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_;
const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
return result != last ? result - ptr_ : npos;
}
int StringPiece::rfind(char c, size_type pos) const {
if (length_ <= 0) return npos;
for (int i = min(pos, static_cast<size_type>(length_ - 1));
i >= 0; --i) {
if (ptr_[i] == c) {
return i;
}
}
return npos;
}
StringPiece StringPiece::substr(size_type pos, size_type n) const {
if (pos > static_cast<size_type>(length_)) pos = static_cast<size_type>(length_);
if (n > length_ - pos) n = length_ - pos;
return StringPiece(ptr_ + pos, n);
}
const StringPiece::size_type StringPiece::npos = size_type(-1);

View File

@@ -0,0 +1,185 @@
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece. The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
#ifndef STRINGS_STRINGPIECE_H__
#define STRINGS_STRINGPIECE_H__
#include <string.h>
#include <algorithm>
#include <cstddef>
#include <iosfwd>
#include <string>
namespace re2 {
class StringPiece {
private:
const char* ptr_;
int length_;
public:
// We provide non-explicit singleton constructors so users can pass
// in a "const char*" or a "string" wherever a "StringPiece" is
// expected.
StringPiece() : ptr_(NULL), length_(0) { }
StringPiece(const char* str)
: ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
StringPiece(const std::string& str)
: ptr_(str.data()), length_(static_cast<int>(str.size())) { }
StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
// data() may return a pointer to a buffer with embedded NULs, and the
// returned buffer may or may not be null terminated. Therefore it is
// typically a mistake to pass data() to a routine that expects a NUL
// terminated string.
const char* data() const { return ptr_; }
int size() const { return length_; }
int length() const { return length_; }
bool empty() const { return length_ == 0; }
void clear() { ptr_ = NULL; length_ = 0; }
void set(const char* data, int len) { ptr_ = data; length_ = len; }
void set(const char* str) {
ptr_ = str;
if (str != NULL)
length_ = static_cast<int>(strlen(str));
else
length_ = 0;
}
void set(const void* data, int len) {
ptr_ = reinterpret_cast<const char*>(data);
length_ = len;
}
char operator[](int i) const { return ptr_[i]; }
void remove_prefix(int n) {
ptr_ += n;
length_ -= n;
}
void remove_suffix(int n) {
length_ -= n;
}
int compare(const StringPiece& x) const {
int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
if (r == 0) {
if (length_ < x.length_) r = -1;
else if (length_ > x.length_) r = +1;
}
return r;
}
std::string as_string() const {
return std::string(data(), size());
}
// We also define ToString() here, since many other string-like
// interfaces name the routine that converts to a C++ string
// "ToString", and it's confusing to have the method that does that
// for a StringPiece be called "as_string()". We also leave the
// "as_string()" method defined here for existing code.
std::string ToString() const {
return std::string(data(), size());
}
void CopyToString(std::string* target) const;
void AppendToString(std::string* target) const;
// Does "this" start with "x"
bool starts_with(const StringPiece& x) const {
return ((length_ >= x.length_) &&
(memcmp(ptr_, x.ptr_, x.length_) == 0));
}
// Does "this" end with "x"
bool ends_with(const StringPiece& x) const {
return ((length_ >= x.length_) &&
(memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
}
// standard STL container boilerplate
typedef char value_type;
typedef const char* pointer;
typedef const char& reference;
typedef const char& const_reference;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
static const size_type npos;
typedef const char* const_iterator;
typedef const char* iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
typedef std::reverse_iterator<iterator> reverse_iterator;
iterator begin() const { return ptr_; }
iterator end() const { return ptr_ + length_; }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(ptr_ + length_);
}
const_reverse_iterator rend() const {
return const_reverse_iterator(ptr_);
}
// STLS says return size_type, but Google says return int
int max_size() const { return length_; }
int capacity() const { return length_; }
int copy(char* buf, size_type n, size_type pos = 0) const;
bool contains(StringPiece s) const;
int find(const StringPiece& s, size_type pos = 0) const;
int find(char c, size_type pos = 0) const;
int rfind(const StringPiece& s, size_type pos = npos) const;
int rfind(char c, size_type pos = npos) const;
StringPiece substr(size_type pos, size_type n = npos) const;
static bool _equal(const StringPiece&, const StringPiece&);
};
inline bool operator==(const StringPiece& x, const StringPiece& y) {
return StringPiece::_equal(x, y);
}
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
inline bool operator<(const StringPiece& x, const StringPiece& y) {
const int r = memcmp(x.data(), y.data(),
std::min(x.size(), y.size()));
return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}
inline bool operator>(const StringPiece& x, const StringPiece& y) {
return y < x;
}
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
return !(x > y);
}
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
return !(x < y);
}
} // namespace re2
// allow StringPiece to be logged
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
#endif // STRINGS_STRINGPIECE_H__

View File

@@ -0,0 +1,341 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Format a regular expression structure as a string.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
enum {
PrecAtom,
PrecUnary,
PrecConcat,
PrecAlternate,
PrecEmpty,
PrecParen,
PrecToplevel,
};
// Helper function. See description below.
static void AppendCCRange(string* t, Rune lo, Rune hi);
// Walker to generate string in s_.
// The arg pointers are actually integers giving the
// context precedence.
// The child_args are always NULL.
class ToStringWalker : public Regexp::Walker<int> {
public:
explicit ToStringWalker(string* t) : t_(t) {}
virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args);
virtual int ShortVisit(Regexp* re, int parent_arg) {
return 0;
}
private:
string* t_; // The string the walker appends to.
DISALLOW_COPY_AND_ASSIGN(ToStringWalker);
};
string Regexp::ToString() {
string t;
ToStringWalker w(&t);
w.WalkExponential(this, PrecToplevel, 100000);
if (w.stopped_early())
t += " [truncated]";
return t;
}
#define ToString DontCallToString // Avoid accidental recursion.
// Visits re before children are processed.
// Appends ( if needed and passes new precedence to children.
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
int prec = parent_arg;
int nprec = PrecAtom;
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpEndText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpCharClass:
case kRegexpHaveMatch:
nprec = PrecAtom;
break;
case kRegexpConcat:
case kRegexpLiteralString:
if (prec < PrecConcat)
t_->append("(?:");
nprec = PrecConcat;
break;
case kRegexpAlternate:
if (prec < PrecAlternate)
t_->append("(?:");
nprec = PrecAlternate;
break;
case kRegexpCapture:
t_->append("(");
if (re->name()) {
t_->append("?P<");
t_->append(*re->name());
t_->append(">");
}
nprec = PrecParen;
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
if (prec < PrecUnary)
t_->append("(?:");
// The subprecedence here is PrecAtom instead of PrecUnary
// because PCRE treats two unary ops in a row as a parse error.
nprec = PrecAtom;
break;
}
return nprec;
}
static void AppendLiteral(string *t, Rune r, bool foldcase) {
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
t->append(1, '\\');
t->append(1, r);
} else if (foldcase && 'a' <= r && r <= 'z') {
if ('a' <= r && r <= 'z')
r += 'A' - 'a';
t->append(1, '[');
t->append(1, r);
t->append(1, r + 'a' - 'A');
t->append(1, ']');
} else {
AppendCCRange(t, r, r);
}
}
// Visits re after children are processed.
// For childless regexps, all the work is done here.
// For regexps with children, append any unary suffixes or ).
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args) {
int prec = parent_arg;
switch (re->op()) {
case kRegexpNoMatch:
// There's no simple symbol for "no match", but
// [^0-Runemax] excludes everything.
t_->append("[^\\x00-\\x{10ffff}]");
break;
case kRegexpEmptyMatch:
// Append (?:) to make empty string visible,
// unless this is already being parenthesized.
if (prec < PrecEmpty)
t_->append("(?:)");
break;
case kRegexpLiteral:
AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
break;
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++)
AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
if (prec < PrecConcat)
t_->append(")");
break;
case kRegexpConcat:
if (prec < PrecConcat)
t_->append(")");
break;
case kRegexpAlternate:
// Clumsy but workable: the children all appended |
// at the end of their strings, so just remove the last one.
if ((*t_)[t_->size()-1] == '|')
t_->erase(t_->size()-1);
else
LOG(DFATAL) << "Bad final char: " << t_;
if (prec < PrecAlternate)
t_->append(")");
break;
case kRegexpStar:
t_->append("*");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpPlus:
t_->append("+");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpQuest:
t_->append("?");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpRepeat:
if (re->max() == -1)
t_->append(StringPrintf("{%d,}", re->min()));
else if (re->min() == re->max())
t_->append(StringPrintf("{%d}", re->min()));
else
t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpAnyChar:
t_->append(".");
break;
case kRegexpAnyByte:
t_->append("\\C");
break;
case kRegexpBeginLine:
t_->append("^");
break;
case kRegexpEndLine:
t_->append("$");
break;
case kRegexpBeginText:
t_->append("(?-m:^)");
break;
case kRegexpEndText:
if (re->parse_flags() & Regexp::WasDollar)
t_->append("(?-m:$)");
else
t_->append("\\z");
break;
case kRegexpWordBoundary:
t_->append("\\b");
break;
case kRegexpNoWordBoundary:
t_->append("\\B");
break;
case kRegexpCharClass: {
if (re->cc()->size() == 0) {
t_->append("[^\\x00-\\x{10ffff}]");
break;
}
t_->append("[");
// Heuristic: show class as negated if it contains the
// non-character 0xFFFE.
CharClass* cc = re->cc();
if (cc->Contains(0xFFFE)) {
cc = cc->Negate();
t_->append("^");
}
for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
AppendCCRange(t_, i->lo, i->hi);
if (cc != re->cc())
cc->Delete();
t_->append("]");
break;
}
case kRegexpCapture:
t_->append(")");
break;
case kRegexpHaveMatch:
// There's no syntax accepted by the parser to generate
// this node (it is generated by RE2::Set) so make something
// up that is readable but won't compile.
t_->append("(?HaveMatch:%d)", re->match_id());
break;
}
// If the parent is an alternation, append the | for it.
if (prec == PrecAlternate)
t_->append("|");
return 0;
}
// Appends a rune for use in a character class to the string t.
static void AppendCCChar(string* t, Rune r) {
if (0x20 <= r && r <= 0x7E) {
if (strchr("[]^-\\", r))
t->append("\\");
t->append(1, r);
return;
}
switch (r) {
default:
break;
case '\r':
t->append("\\r");
return;
case '\t':
t->append("\\t");
return;
case '\n':
t->append("\\n");
return;
case '\f':
t->append("\\f");
return;
}
if (r < 0x100) {
StringAppendF(t, "\\x%02x", static_cast<int>(r));
return;
}
StringAppendF(t, "\\x{%x}", static_cast<int>(r));
}
static void AppendCCRange(string* t, Rune lo, Rune hi) {
if (lo > hi)
return;
AppendCCChar(t, lo);
if (lo < hi) {
t->append("-");
AppendCCChar(t, hi);
}
}
} // namespace re2

View File

@@ -0,0 +1,480 @@
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
// 1034 groups, 2089 pairs, 289 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
{ 107, 107, 8383 },
{ 108, 114, -32 },
{ 115, 115, 268 },
{ 116, 122, -32 },
{ 181, 181, 743 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 223, 223, 7615 },
{ 224, 228, -32 },
{ 229, 229, 8262 },
{ 230, 246, -32 },
{ 248, 254, -32 },
{ 255, 255, 121 },
{ 256, 303, EvenOdd },
{ 306, 311, EvenOdd },
{ 313, 328, OddEven },
{ 330, 375, EvenOdd },
{ 376, 376, -121 },
{ 377, 382, OddEven },
{ 383, 383, -300 },
{ 384, 384, 195 },
{ 385, 385, 210 },
{ 386, 389, EvenOdd },
{ 390, 390, 206 },
{ 391, 392, OddEven },
{ 393, 394, 205 },
{ 395, 396, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 402, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 405, 405, 97 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 409, EvenOdd },
{ 410, 410, 163 },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 414, 414, 130 },
{ 415, 415, 214 },
{ 416, 421, EvenOdd },
{ 422, 422, 218 },
{ 423, 424, OddEven },
{ 425, 425, 218 },
{ 428, 429, EvenOdd },
{ 430, 430, 218 },
{ 431, 432, OddEven },
{ 433, 434, 217 },
{ 435, 438, OddEven },
{ 439, 439, 219 },
{ 440, 441, EvenOdd },
{ 444, 445, EvenOdd },
{ 447, 447, 56 },
{ 452, 452, EvenOdd },
{ 453, 453, OddEven },
{ 454, 454, -2 },
{ 455, 455, OddEven },
{ 456, 456, EvenOdd },
{ 457, 457, -2 },
{ 458, 458, EvenOdd },
{ 459, 459, OddEven },
{ 460, 460, -2 },
{ 461, 476, OddEven },
{ 477, 477, -79 },
{ 478, 495, EvenOdd },
{ 497, 497, OddEven },
{ 498, 498, EvenOdd },
{ 499, 499, -2 },
{ 500, 501, EvenOdd },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 543, EvenOdd },
{ 544, 544, -130 },
{ 546, 563, EvenOdd },
{ 570, 570, 10795 },
{ 571, 572, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 575, 576, 10815 },
{ 577, 578, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 591, EvenOdd },
{ 592, 592, 10783 },
{ 593, 593, 10780 },
{ 594, 594, 10782 },
{ 595, 595, -210 },
{ 596, 596, -206 },
{ 598, 599, -205 },
{ 601, 601, -202 },
{ 603, 603, -203 },
{ 608, 608, -205 },
{ 611, 611, -207 },
{ 613, 613, 42280 },
{ 614, 614, 42308 },
{ 616, 616, -209 },
{ 617, 617, -211 },
{ 619, 619, 10743 },
{ 623, 623, -211 },
{ 625, 625, 10749 },
{ 626, 626, -213 },
{ 629, 629, -214 },
{ 637, 637, 10727 },
{ 640, 640, -218 },
{ 643, 643, -218 },
{ 648, 648, -218 },
{ 649, 649, -69 },
{ 650, 651, -217 },
{ 652, 652, -71 },
{ 658, 658, -219 },
{ 837, 837, 84 },
{ 880, 883, EvenOdd },
{ 886, 887, EvenOdd },
{ 891, 893, 130 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 931, 31 },
{ 932, 939, 32 },
{ 940, 940, -38 },
{ 941, 943, -37 },
{ 945, 945, -32 },
{ 946, 946, 30 },
{ 947, 948, -32 },
{ 949, 949, 64 },
{ 950, 951, -32 },
{ 952, 952, 25 },
{ 953, 953, 7173 },
{ 954, 954, 54 },
{ 955, 955, -32 },
{ 956, 956, -775 },
{ 957, 959, -32 },
{ 960, 960, 22 },
{ 961, 961, 48 },
{ 962, 962, EvenOdd },
{ 963, 965, -32 },
{ 966, 966, 15 },
{ 967, 968, -32 },
{ 969, 969, 7517 },
{ 970, 971, -32 },
{ 972, 972, -64 },
{ 973, 974, -63 },
{ 975, 975, 8 },
{ 976, 976, -62 },
{ 977, 977, 35 },
{ 981, 981, -47 },
{ 982, 982, -54 },
{ 983, 983, -8 },
{ 984, 1007, EvenOdd },
{ 1008, 1008, -86 },
{ 1009, 1009, -80 },
{ 1010, 1010, 7 },
{ 1012, 1012, -92 },
{ 1013, 1013, -96 },
{ 1015, 1016, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1019, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1072, 1103, -32 },
{ 1104, 1119, -80 },
{ 1120, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
{ 1231, 1231, -15 },
{ 1232, 1319, EvenOdd },
{ 1329, 1366, 48 },
{ 1377, 1414, -48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7680, 7776, EvenOdd },
{ 7777, 7777, 58 },
{ 7778, 7829, EvenOdd },
{ 7835, 7835, -59 },
{ 7838, 7838, -7615 },
{ 7840, 7935, EvenOdd },
{ 7936, 7943, 8 },
{ 7944, 7951, -8 },
{ 7952, 7957, 8 },
{ 7960, 7965, -8 },
{ 7968, 7975, 8 },
{ 7976, 7983, -8 },
{ 7984, 7991, 8 },
{ 7992, 7999, -8 },
{ 8000, 8005, 8 },
{ 8008, 8013, -8 },
{ 8017, 8017, 8 },
{ 8019, 8019, 8 },
{ 8021, 8021, 8 },
{ 8023, 8023, 8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8032, 8039, 8 },
{ 8040, 8047, -8 },
{ 8048, 8049, 74 },
{ 8050, 8053, 86 },
{ 8054, 8055, 100 },
{ 8056, 8057, 128 },
{ 8058, 8059, 112 },
{ 8060, 8061, 126 },
{ 8064, 8071, 8 },
{ 8072, 8079, -8 },
{ 8080, 8087, 8 },
{ 8088, 8095, -8 },
{ 8096, 8103, 8 },
{ 8104, 8111, -8 },
{ 8112, 8113, 8 },
{ 8115, 8115, 9 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7289 },
{ 8131, 8131, 9 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8144, 8145, 8 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8160, 8161, 8 },
{ 8165, 8165, 7 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8179, 8179, 9 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7549 },
{ 8490, 8490, -8415 },
{ 8491, 8491, -8294 },
{ 8498, 8498, 28 },
{ 8526, 8526, -28 },
{ 8544, 8559, 16 },
{ 8560, 8575, -16 },
{ 8579, 8580, OddEven },
{ 9398, 9423, 26 },
{ 9424, 9449, -26 },
{ 11264, 11310, 48 },
{ 11312, 11358, -48 },
{ 11360, 11361, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11365, 11365, -10795 },
{ 11366, 11366, -10792 },
{ 11367, 11372, OddEven },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11379, EvenOdd },
{ 11381, 11382, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11491, EvenOdd },
{ 11499, 11502, OddEven },
{ 11506, 11507, EvenOdd },
{ 11520, 11557, -7264 },
{ 11559, 11559, -7264 },
{ 11565, 11565, -7264 },
{ 42560, 42605, EvenOdd },
{ 42624, 42647, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
{ 42873, 42876, OddEven },
{ 42877, 42877, -35332 },
{ 42878, 42887, EvenOdd },
{ 42891, 42892, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42899, EvenOdd },
{ 42912, 42921, EvenOdd },
{ 42922, 42922, -42308 },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
};
const int num_unicode_casefold = 289;
// 1034 groups, 1055 pairs, 167 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 256, 302, EvenOddSkip },
{ 306, 310, EvenOddSkip },
{ 313, 327, OddEvenSkip },
{ 330, 374, EvenOddSkip },
{ 376, 376, -121 },
{ 377, 381, OddEvenSkip },
{ 383, 383, -268 },
{ 385, 385, 210 },
{ 386, 388, EvenOddSkip },
{ 390, 390, 206 },
{ 391, 391, OddEven },
{ 393, 394, 205 },
{ 395, 395, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 401, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 408, EvenOdd },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 415, 415, 214 },
{ 416, 420, EvenOddSkip },
{ 422, 422, 218 },
{ 423, 423, OddEven },
{ 425, 425, 218 },
{ 428, 428, EvenOdd },
{ 430, 430, 218 },
{ 431, 431, OddEven },
{ 433, 434, 217 },
{ 435, 437, OddEvenSkip },
{ 439, 439, 219 },
{ 440, 440, EvenOdd },
{ 444, 444, EvenOdd },
{ 452, 452, 2 },
{ 453, 453, OddEven },
{ 455, 455, 2 },
{ 456, 456, EvenOdd },
{ 458, 458, 2 },
{ 459, 475, OddEvenSkip },
{ 478, 494, EvenOddSkip },
{ 497, 497, 2 },
{ 498, 500, EvenOddSkip },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 542, EvenOddSkip },
{ 544, 544, -130 },
{ 546, 562, EvenOddSkip },
{ 570, 570, 10795 },
{ 571, 571, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 577, 577, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 590, EvenOddSkip },
{ 837, 837, 116 },
{ 880, 882, EvenOddSkip },
{ 886, 886, EvenOdd },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 939, 32 },
{ 962, 962, EvenOdd },
{ 975, 975, 8 },
{ 976, 976, -30 },
{ 977, 977, -25 },
{ 981, 981, -15 },
{ 982, 982, -22 },
{ 984, 1006, EvenOddSkip },
{ 1008, 1008, -54 },
{ 1009, 1009, -48 },
{ 1012, 1012, -60 },
{ 1013, 1013, -64 },
{ 1015, 1015, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1018, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1120, 1152, EvenOddSkip },
{ 1162, 1214, EvenOddSkip },
{ 1216, 1216, 15 },
{ 1217, 1229, OddEvenSkip },
{ 1232, 1318, EvenOddSkip },
{ 1329, 1366, 48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
{ 7840, 7934, EvenOddSkip },
{ 7944, 7951, -8 },
{ 7960, 7965, -8 },
{ 7976, 7983, -8 },
{ 7992, 7999, -8 },
{ 8008, 8013, -8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8040, 8047, -8 },
{ 8072, 8079, -8 },
{ 8088, 8095, -8 },
{ 8104, 8111, -8 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7173 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7517 },
{ 8490, 8490, -8383 },
{ 8491, 8491, -8262 },
{ 8498, 8498, 28 },
{ 8544, 8559, 16 },
{ 8579, 8579, OddEven },
{ 9398, 9423, 26 },
{ 11264, 11310, 48 },
{ 11360, 11360, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11367, 11371, OddEvenSkip },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11378, EvenOdd },
{ 11381, 11381, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11490, EvenOddSkip },
{ 11499, 11501, OddEvenSkip },
{ 11506, 11506, EvenOdd },
{ 42560, 42604, EvenOddSkip },
{ 42624, 42646, EvenOddSkip },
{ 42786, 42798, EvenOddSkip },
{ 42802, 42862, EvenOddSkip },
{ 42873, 42875, OddEvenSkip },
{ 42877, 42877, -35332 },
{ 42878, 42886, EvenOddSkip },
{ 42891, 42891, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42898, EvenOddSkip },
{ 42912, 42920, EvenOddSkip },
{ 42922, 42922, -42308 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
};
const int num_unicode_tolower = 167;
} // namespace re2

View File

@@ -0,0 +1,75 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Unicode case folding tables.
// The Unicode case folding tables encode the mapping from one Unicode point
// to the next largest Unicode point with equivalent folding. The largest
// point wraps back to the first. For example, the tables map:
//
// 'A' -> 'a'
// 'a' -> 'A'
//
// 'K' -> 'k'
// 'k' -> '' (Kelvin symbol)
// '' -> 'K'
//
// Like everything Unicode, these tables are big. If we represent the table
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
// Most table entries look like the ones around them:
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
// Instead of listing all the pairs explicitly, we make a list of ranges
// and deltas, so that the table entries for 'A' through 'Z' can be represented
// as a single entry { 'A', 'Z', +32 }.
//
// In addition to blocks that map to each other (A-Z mapping to a-z)
// there are blocks of pairs that individually map to each other
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
// For those, the special delta value EvenOdd marks even/odd pairs
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
//
// In this form, the table has 274 entries, about 3kB. If we were to split
// the table into one for 16-bit codes and an overflow table for larger ones,
// we could get it down to about 1.5kB, but that's not worth the complexity.
//
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
#ifndef RE2_UNICODE_CASEFOLD_H__
#define RE2_UNICODE_CASEFOLD_H__
#include "util/util.h"
namespace re2 {
enum {
EvenOdd = 1,
OddEven = -1,
EvenOddSkip = 1<<30,
OddEvenSkip,
};
struct CaseFold {
Rune lo;
Rune hi;
int32 delta;
};
extern const CaseFold unicode_casefold[];
extern const int num_unicode_casefold;
extern const CaseFold unicode_tolower[];
extern const int num_unicode_tolower;
// Returns the CaseFold* in the tables that contains rune.
// If rune is not in the tables, returns the first CaseFold* after rune.
// If rune is larger than any value in the tables, returns NULL.
extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
// Returns the result of applying the fold f to the rune r.
extern Rune ApplyFold(const CaseFold *f, Rune r);
} // namespace re2
#endif // RE2_UNICODE_CASEFOLD_H__

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,64 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Unicode character groups.
// The codes get split into ranges of 16-bit codes
// and ranges of 32-bit codes. It would be simpler
// to use only 32-bit ranges, but these tables are large
// enough to warrant extra care.
//
// Using just 32-bit ranges gives 27 kB of data.
// Adding 16-bit ranges gives 18 kB of data.
// Adding an extra table of 16-bit singletons would reduce
// to 16.5 kB of data but make the data harder to use;
// we don't bother.
#ifndef RE2_UNICODE_GROUPS_H__
#define RE2_UNICODE_GROUPS_H__
#include "util/util.h"
namespace re2 {
struct URange16
{
uint16 lo;
uint16 hi;
};
struct URange32
{
Rune lo;
Rune hi;
};
struct UGroup
{
const char *name;
int sign; // +1 for [abc], -1 for [^abc]
const URange16 *r16;
int nr16;
const URange32 *r32;
int nr32;
};
// Named by property or script name (e.g., "Nd", "N", "Han").
// Negated groups are not included.
extern const UGroup unicode_groups[];
extern const int num_unicode_groups;
// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
// Negated groups are included.
extern const UGroup posix_groups[];
extern const int num_posix_groups;
// Named by Perl name (e.g., "\\d", "\\D").
// Negated groups are included.
extern const UGroup perl_groups[];
extern const int num_perl_groups;
} // namespace re2
#endif // RE2_UNICODE_GROUPS_H__

View File

@@ -0,0 +1,164 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_ATOMICOPS_H__
#define RE2_UTIL_ATOMICOPS_H__
// The memory ordering constraints resemble the ones in C11.
// RELAXED - no memory ordering, just an atomic operation.
// CONSUME - data-dependent ordering.
// ACQUIRE - prevents memory accesses from hoisting above the operation.
// RELEASE - prevents memory accesses from sinking below the operation.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#if !defined(OS_NACL) && (__has_builtin(__atomic_load_n) || (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__ >= 40801))
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0)
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0)
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0)
#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED)
#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE)
#else // old compiler
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0)
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0)
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0)
#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0)
#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0)
// WriteMemoryBarrier(), ReadMemoryBarrier() and MaybeReadMemoryBarrier()
// are an implementation detail and must not be used in the rest of the code.
#if defined(__i386__)
static inline void WriteMemoryBarrier() {
int x;
__asm__ __volatile__("xchgl (%0),%0" // The lock prefix is implicit for xchg.
:: "r" (&x));
}
#elif defined(__x86_64__)
// 64-bit implementations of memory barrier can be simpler, because
// "sfence" is guaranteed to exist.
static inline void WriteMemoryBarrier() {
__asm__ __volatile__("sfence" : : : "memory");
}
#elif defined(__ppc__) || defined(__powerpc64__)
static inline void WriteMemoryBarrier() {
__asm__ __volatile__("eieio" : : : "memory");
}
#elif defined(__alpha__)
static inline void WriteMemoryBarrier() {
__asm__ __volatile__("wmb" : : : "memory");
}
#elif defined(__aarch64__)
static inline void WriteMemoryBarrier() {
__asm__ __volatile__("dmb st" : : : "memory");
}
#elif defined(__arm__) && defined(__linux__)
// Linux on ARM puts a suitable memory barrier at a magic address for us to call.
static inline void WriteMemoryBarrier() {
((void(*)(void))0xffff0fa0)();
}
#elif defined(__windows__)
// Windows
inline void WriteMemoryBarrier() {
LONG x;
::InterlockedExchange(&x, 0);
}
#elif defined(OS_NACL)
// Native Client
inline void WriteMemoryBarrier() {
__sync_synchronize();
}
#elif defined(__mips__)
inline void WriteMemoryBarrier() {
__asm__ __volatile__("sync" : : : "memory");
}
#else
#include "util/mutex.h"
static inline void WriteMemoryBarrier() {
// Slight overkill, but good enough:
// any mutex implementation must have
// a read barrier after the lock operation and
// a write barrier before the unlock operation.
//
// It may be worthwhile to write architecture-specific
// barriers for the common platforms, as above, but
// this is a correct fallback.
re2::Mutex mu;
re2::MutexLock l(&mu);
}
#endif
// Alpha has very weak memory ordering. If relying on WriteBarriers, one must
// use read barriers for the readers too.
#if defined(__alpha__)
static inline void MaybeReadMemoryBarrier() {
__asm__ __volatile__("mb" : : : "memory");
}
#else
static inline void MaybeReadMemoryBarrier() {}
#endif // __alpha__
// Read barrier for various targets.
#if defined(__aarch64__)
static inline void ReadMemoryBarrier() {
__asm__ __volatile__("dmb ld" : : : "memory");
}
#elif defined(__alpha__)
static inline void ReadMemoryBarrier() {
__asm__ __volatile__("mb" : : : "memory");
}
#elif defined(__mips__)
inline void ReadMemoryBarrier() {
__asm__ __volatile__("sync" : : : "memory");
}
#else
static inline void ReadMemoryBarrier() {}
#endif
#endif // old compiler
#ifndef NO_THREAD_SAFETY_ANALYSIS
#define NO_THREAD_SAFETY_ANALYSIS
#endif
#endif // RE2_UTIL_ATOMICOPS_H__

View File

@@ -0,0 +1,41 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_BENCHMARK_H__
#define RE2_UTIL_BENCHMARK_H__
namespace testing {
struct Benchmark {
const char* name;
void (*fn)(int);
void (*fnr)(int, int);
int lo;
int hi;
int threadlo;
int threadhi;
void Register();
Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); }
Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); }
void Clear(const char* n) { name = n; fn = 0; fnr = 0; lo = 0; hi = 0; threadlo = 0; threadhi = 0; }
Benchmark* ThreadRange(int lo, int hi) { threadlo = lo; threadhi = hi; return this; }
};
} // namespace testing
void SetBenchmarkBytesProcessed(long long);
void StopBenchmarkTiming();
void StartBenchmarkTiming();
void BenchmarkMemoryUsage();
void SetBenchmarkItemsProcessed(int);
int NumCPUs();
#define BENCHMARK(f) \
::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
#define BENCHMARK_RANGE(f, lo, hi) \
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
#endif // RE2_UTIL_BENCHMARK_H__

View File

@@ -0,0 +1,27 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Simplified version of Google's command line flags.
// Does not support parsing the command line.
// If you want to do that, see
// https://gflags.github.io/gflags/
#ifndef RE2_UTIL_FLAGS_H__
#define RE2_UTIL_FLAGS_H__
#define DEFINE_flag(type, name, deflt, desc) \
namespace re2 { type FLAGS_##name = deflt; }
#define DECLARE_flag(type, name) \
namespace re2 { extern type FLAGS_##name; }
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
#define DECLARE_bool(name) DECLARE_flag(bool, name)
#define DECLARE_int32(name) DECLARE_flag(int32, name)
#define DECLARE_string(name) DECLARE_flag(string, name)
#endif // RE2_UTIL_FLAGS_H__

View File

@@ -0,0 +1,231 @@
// Modified by Russ Cox to add "namespace re2".
// Also threw away all but hashword and hashword2.
// http://burtleburtle.net/bob/c/lookup3.c
/*
-------------------------------------------------------------------------------
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
These are functions for producing 32-bit hashes for hash table lookup.
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
are externally useful functions. Routines to test the hash are included
if SELF_TEST is defined. You can use this free for any purpose. It's in
the public domain. It has no warranty.
You probably want to use hashlittle(). hashlittle() and hashbig()
hash byte arrays. hashlittle() is is faster than hashbig() on
little-endian machines. Intel and AMD are little-endian machines.
On second thought, you probably want hashlittle2(), which is identical to
hashlittle() except it returns two 32-bit hashes for the price of one.
You could implement hashbig2() if you wanted but I haven't bothered here.
If you want to find a hash of, say, exactly 7 integers, do
a = i1; b = i2; c = i3;
mix(a,b,c);
a += i4; b += i5; c += i6;
mix(a,b,c);
a += i7;
final(a,b,c);
then use c as the hash value. If you have a variable length array of
4-byte integers to hash, use hashword(). If you have a byte array (like
a character string), use hashlittle(). If you have several byte arrays, or
a mix of things, see the comments above hashlittle().
Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
then mix those integers. This is fast (you can do a lot more thorough
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
-------------------------------------------------------------------------------
*/
#include "re2/util/util.h"
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
/*
-------------------------------------------------------------------------------
mix -- mix 3 32-bit values reversibly.
This is reversible, so any information in (a,b,c) before mix() is
still in (a,b,c) after mix().
If four pairs of (a,b,c) inputs are run through mix(), or through
mix() in reverse, there are at least 32 bits of the output that
are sometimes the same for one pair and different for another pair.
This was tested for:
* pairs that differed by one bit, by two bits, in any combination
of top bits of (a,b,c), or in any combination of bottom bits of
(a,b,c).
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
is commonly produced by subtraction) look like a single 1-bit
difference.
* the base values were pseudorandom, all zero but one bit set, or
all zero plus a counter that starts at zero.
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
satisfy this are
4 6 8 16 19 4
9 15 3 18 27 15
14 9 3 7 17 3
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
for "differ" defined as + with a one-bit base and a two-bit delta. I
used http://burtleburtle.net/bob/hash/avalanche.html to choose
the operations, constants, and arrangements of the variables.
This does not achieve avalanche. There are input bits of (a,b,c)
that fail to affect some output bits of (a,b,c), especially of a. The
most thoroughly mixed value is c, but it doesn't really even achieve
avalanche in c.
This allows some parallelism. Read-after-writes are good at doubling
the number of bits affected, so the goal of mixing pulls in the opposite
direction as the goal of parallelism. I did what I could. Rotates
seem to cost as much as shifts on every machine I could lay my hands
on, and rotates are much kinder to the top and bottom bits, so I used
rotates.
-------------------------------------------------------------------------------
*/
#define mix(a,b,c) \
{ \
a -= c; a ^= rot(c, 4); c += b; \
b -= a; b ^= rot(a, 6); a += c; \
c -= b; c ^= rot(b, 8); b += a; \
a -= c; a ^= rot(c,16); c += b; \
b -= a; b ^= rot(a,19); a += c; \
c -= b; c ^= rot(b, 4); b += a; \
}
/*
-------------------------------------------------------------------------------
final -- final mixing of 3 32-bit values (a,b,c) into c
Pairs of (a,b,c) values differing in only a few bits will usually
produce values of c that look totally different. This was tested for
* pairs that differed by one bit, by two bits, in any combination
of top bits of (a,b,c), or in any combination of bottom bits of
(a,b,c).
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
is commonly produced by subtraction) look like a single 1-bit
difference.
* the base values were pseudorandom, all zero but one bit set, or
all zero plus a counter that starts at zero.
These constants passed:
14 11 25 16 4 14 24
12 14 25 16 4 14 24
and these came close:
4 8 15 26 3 22 24
10 8 15 26 3 22 24
11 8 15 26 3 22 24
-------------------------------------------------------------------------------
*/
#define final(a,b,c) \
{ \
c ^= b; c -= rot(b,14); \
a ^= c; a -= rot(c,11); \
b ^= a; b -= rot(a,25); \
c ^= b; c -= rot(b,16); \
a ^= c; a -= rot(c,4); \
b ^= a; b -= rot(a,14); \
c ^= b; c -= rot(b,24); \
}
namespace re2 {
/*
--------------------------------------------------------------------
This works on all machines. To be useful, it requires
-- that the key be an array of uint32_t's, and
-- that the length be the number of uint32_t's in the key
The function hashword() is identical to hashlittle() on little-endian
machines, and identical to hashbig() on big-endian machines,
except that the length has to be measured in uint32_ts rather than in
bytes. hashlittle() is more complicated than hashword() only because
hashlittle() has to dance around fitting the key bytes into registers.
--------------------------------------------------------------------
*/
uint32 hashword(
const uint32 *k, /* the key, an array of uint32_t values */
size_t length, /* the length of the key, in uint32_ts */
uint32 initval) /* the previous hash, or an arbitrary value */
{
uint32_t a,b,c;
/* Set up the internal state */
a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
/*------------------------------------------------- handle most of the key */
while (length > 3)
{
a += k[0];
b += k[1];
c += k[2];
mix(a,b,c);
length -= 3;
k += 3;
}
/*------------------------------------------- handle the last 3 uint32_t's */
switch(length) /* all the case statements fall through */
{
case 3 : c+=k[2];
case 2 : b+=k[1];
case 1 : a+=k[0];
final(a,b,c);
case 0: /* case 0: nothing left to add */
break;
}
/*------------------------------------------------------ report the result */
return c;
}
/*
--------------------------------------------------------------------
hashword2() -- same as hashword(), but take two seeds and return two
32-bit values. pc and pb must both be nonnull, and *pc and *pb must
both be initialized with seeds. If you pass in (*pb)==0, the output
(*pc) will be the same as the return value from hashword().
--------------------------------------------------------------------
*/
void hashword2 (
const uint32 *k, /* the key, an array of uint32_t values */
size_t length, /* the length of the key, in uint32_ts */
uint32 *pc, /* IN: seed OUT: primary hash value */
uint32 *pb) /* IN: more seed OUT: secondary hash value */
{
uint32_t a,b,c;
/* Set up the internal state */
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
c += *pb;
/*------------------------------------------------- handle most of the key */
while (length > 3)
{
a += k[0];
b += k[1];
c += k[2];
mix(a,b,c);
length -= 3;
k += 3;
}
/*------------------------------------------- handle the last 3 uint32_t's */
switch(length) /* all the case statements fall through */
{
case 3 : c+=k[2];
case 2 : b+=k[1];
case 1 : a+=k[0];
final(a,b,c);
case 0: /* case 0: nothing left to add */
break;
}
/*------------------------------------------------------ report the result */
*pc=c; *pb=b;
}
} // namespace re2

View File

@@ -0,0 +1,86 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Simplified version of Google's logging.
#ifndef RE2_UTIL_LOGGING_H__
#define RE2_UTIL_LOGGING_H__
#include <stdio.h> /* for fwrite */
#include <sstream>
// Debug-only checking.
#define DCHECK(condition) assert(condition)
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
#define DCHECK_NE(val1, val2) assert((val1) != (val2))
#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
#define DCHECK_LT(val1, val2) assert((val1) < (val2))
#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
#define DCHECK_GT(val1, val2) assert((val1) > (val2))
// Always-on checking
#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
#define CHECK_LT(x, y) CHECK((x) < (y))
#define CHECK_GT(x, y) CHECK((x) > (y))
#define CHECK_LE(x, y) CHECK((x) <= (y))
#define CHECK_GE(x, y) CHECK((x) >= (y))
#define CHECK_EQ(x, y) CHECK((x) == (y))
#define CHECK_NE(x, y) CHECK((x) != (y))
#define LOG_INFO LogMessage(__FILE__, __LINE__)
#define LOG_ERROR LOG_INFO
#define LOG_WARNING LOG_INFO
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
#define LOG_QFATAL LOG_FATAL
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
#ifdef NDEBUG
#define DEBUG_MODE 0
#define LOG_DFATAL LOG_ERROR
#else
#define DEBUG_MODE 1
#define LOG_DFATAL LOG_FATAL
#endif
#define LOG(severity) LOG_ ## severity.stream()
class LogMessage {
public:
LogMessage(const char* file, int line) : flushed_(false) {
stream() << file << ":" << line << ": ";
}
void Flush() {
stream() << "\n";
string s = str_.str();
size_t n = s.size();
if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc
flushed_ = true;
}
~LogMessage() {
if (!flushed_) {
Flush();
}
}
ostream& stream() { return str_; }
private:
bool flushed_;
std::ostringstream str_;
DISALLOW_COPY_AND_ASSIGN(LogMessage);
};
class LogMessageFatal : public LogMessage {
public:
LogMessageFatal(const char* file, int line)
: LogMessage(file, line) { }
~LogMessageFatal() {
Flush();
abort();
}
private:
DISALLOW_COPY_AND_ASSIGN(LogMessageFatal);
};
#endif // RE2_UTIL_LOGGING_H__

View File

@@ -0,0 +1,213 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
* A simple mutex wrapper, supporting locks and read-write locks.
* You should assume the locks are *not* re-entrant.
*/
#ifndef RE2_UTIL_MUTEX_H_
#define RE2_UTIL_MUTEX_H_
#include <stdlib.h>
namespace re2 {
#ifndef WIN32
#define HAVE_PTHREAD 1
#define HAVE_RWLOCK 1
#endif
#if defined(NO_THREADS)
typedef int MutexType; // to keep a lock-count
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
// Needed for pthread_rwlock_*. If it causes problems, you could take it
// out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
// *does* cause problems for FreeBSD, or MacOSX, but isn't needed
// for locking there.)
# ifdef __linux__
# undef _XOPEN_SOURCE
# define _XOPEN_SOURCE 500 // may be needed to get the rwlock calls
# endif
# include <pthread.h>
typedef pthread_rwlock_t MutexType;
#elif defined(HAVE_PTHREAD)
# include <pthread.h>
typedef pthread_mutex_t MutexType;
#elif defined(_WIN32)
# define WIN32_LEAN_AND_MEAN // We only need minimal includes
# ifdef GMUTEX_TRYLOCK
// We need Windows NT or later for TryEnterCriticalSection(). If you
// don't need that functionality, you can remove these _WIN32_WINNT
// lines, and change TryLock() to assert(0) or something.
# ifndef _WIN32_WINNT
# define _WIN32_WINNT 0x0400
# endif
# endif
# include <windows.h>
typedef CRITICAL_SECTION MutexType;
#else
# error Need to implement mutex.h for your architecture, or #define NO_THREADS
#endif
class Mutex {
public:
// Create a Mutex that is not held by anybody.
inline Mutex();
// Destructor
inline ~Mutex();
inline void Lock(); // Block if needed until free then acquire exclusively
inline void Unlock(); // Release a lock acquired via Lock()
inline bool TryLock(); // If free, Lock() and return true, else return false
// Note that on systems that don't support read-write locks, these may
// be implemented as synonyms to Lock() and Unlock(). So you can use
// these for efficiency, but don't use them anyplace where being able
// to do shared reads is necessary to avoid deadlock.
inline void ReaderLock(); // Block until free or shared then acquire a share
inline void ReaderUnlock(); // Release a read share of this Mutex
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
inline void AssertHeld() { }
private:
MutexType mutex_;
// Catch the error of writing Mutex when intending MutexLock.
Mutex(Mutex *ignored);
// Disallow "evil" constructors
Mutex(const Mutex&);
void operator=(const Mutex&);
};
// Now the implementation of Mutex for various systems
#if defined(NO_THREADS)
// When we don't have threads, we can be either reading or writing,
// but not both. We can have lots of readers at once (in no-threads
// mode, that's most likely to happen in recursive function calls),
// but only one writer. We represent this by having mutex_ be -1 when
// writing and a number > 0 when reading (and 0 when no lock is held).
//
// In debug mode, we assert these invariants, while in non-debug mode
// we do nothing, for efficiency. That's why everything is in an
// assert.
#include <assert.h>
Mutex::Mutex() : mutex_(0) { }
Mutex::~Mutex() { assert(mutex_ == 0); }
void Mutex::Lock() { assert(--mutex_ == -1); }
void Mutex::Unlock() { assert(mutex_++ == -1); }
bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; }
void Mutex::ReaderLock() { assert(++mutex_ > 0); }
void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
bool Mutex::TryLock() { return pthread_rwlock_trywrlock(&mutex_) == 0; }
void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
#undef SAFE_PTHREAD
#elif defined(HAVE_PTHREAD)
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
Mutex::Mutex() { SAFE_PTHREAD(pthread_mutex_init(&mutex_, NULL)); }
Mutex::~Mutex() { SAFE_PTHREAD(pthread_mutex_destroy(&mutex_)); }
void Mutex::Lock() { SAFE_PTHREAD(pthread_mutex_lock(&mutex_)); }
void Mutex::Unlock() { SAFE_PTHREAD(pthread_mutex_unlock(&mutex_)); }
bool Mutex::TryLock() { return pthread_mutex_trylock(&mutex_) == 0; }
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
void Mutex::ReaderUnlock() { Unlock(); }
#undef SAFE_PTHREAD
#elif defined(_WIN32)
Mutex::Mutex() { InitializeCriticalSection(&mutex_); }
Mutex::~Mutex() { DeleteCriticalSection(&mutex_); }
void Mutex::Lock() { EnterCriticalSection(&mutex_); }
void Mutex::Unlock() { LeaveCriticalSection(&mutex_); }
bool Mutex::TryLock() { return TryEnterCriticalSection(&mutex_) != 0; }
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
void Mutex::ReaderUnlock() { Unlock(); }
#endif
// --------------------------------------------------------------------------
// Some helper classes
// MutexLock(mu) acquires mu when constructed and releases it when destroyed.
class MutexLock {
public:
explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); }
~MutexLock() { mu_->Unlock(); }
private:
Mutex * const mu_;
// Disallow "evil" constructors
MutexLock(const MutexLock&);
void operator=(const MutexLock&);
};
// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
class ReaderMutexLock {
public:
explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); }
~ReaderMutexLock() { mu_->ReaderUnlock(); }
private:
Mutex * const mu_;
// Disallow "evil" constructors
ReaderMutexLock(const ReaderMutexLock&);
void operator=(const ReaderMutexLock&);
};
class WriterMutexLock {
public:
explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); }
~WriterMutexLock() { mu_->WriterUnlock(); }
private:
Mutex * const mu_;
// Disallow "evil" constructors
WriterMutexLock(const WriterMutexLock&);
void operator=(const WriterMutexLock&);
};
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
#define MutexLock(x) COMPILE_ASSERT(0, mutex_lock_decl_missing_var_name)
#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name)
#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
// Provide safe way to declare and use global, linker-initialized mutex. Sigh.
#ifdef HAVE_PTHREAD
#define GLOBAL_MUTEX(name) \
static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER
#define GLOBAL_MUTEX_LOCK(name) \
pthread_mutex_lock(&(name))
#define GLOBAL_MUTEX_UNLOCK(name) \
pthread_mutex_unlock(&(name))
#else
#define GLOBAL_MUTEX(name) \
static Mutex name
#define GLOBAL_MUTEX_LOCK(name) \
name.Lock()
#define GLOBAL_MUTEX_UNLOCK(name) \
name.Unlock()
#endif
} // namespace re2
#endif /* #define RE2_UTIL_MUTEX_H_ */

View File

@@ -0,0 +1,679 @@
// Copyright 2003-2010 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
// The main changes are the addition of the HitLimit method and
// compilation as PCRE in namespace re2.
// C++ interface to the pcre regular-expression library. PCRE supports
// Perl-style regular expressions (with extensions like \d, \w, \s,
// ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the pcre library and hence supports its syntax
// for regular expressions:
//
// http://www.google.com/search?q=pcre
//
// The syntax is pretty similar to Perl's. For those not familiar
// with Perl's regular expressions, here are some examples of the most
// commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(PCRE::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!PCRE::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, pattern and text are plain text, one byte per character.
// The UTF8 flag, passed to the constructor, causes both pattern
// and string to be treated as UTF-8 text, still a byte stream but
// potentially multiple bytes per character. In practice, the text
// is likelier to be UTF-8 than the pattern, but the match returned
// may depend on the UTF8 flag, so always use it when matching
// UTF8 text. E.g., "." will match one byte normally but with UTF8
// set may match up to three bytes of a multi-byte character.
//
// Example:
// PCRE re(utf8_pattern, PCRE::UTF8);
// CHECK(PCRE::FullMatch(utf8_string, re));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUB-STRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched subpieces.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// string s;
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns:
// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(PCRE::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PPCRE-COMPILED PCREGULAR EXPPCRESSIONS
//
// PCRE makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "PCRE"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// PCRE pattern("h.*o");
// while (ReadLine(&str)) {
// if (PCRE::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCPCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// string var;
// int value;
// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// PCRE::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d));
// will leave 64 in a, b, c, and d.
#include "util/util.h"
#include "re2/stringpiece.h"
#ifdef USEPCRE
#include <pcre.h>
namespace re2 {
const bool UsingPCRE = true;
} // namespace re2
#else
namespace re2 {
const bool UsingPCRE = false;
struct pcre;
struct pcre_extra { int flags, match_limit, match_limit_recursion; };
#define pcre_free(x) {}
#define PCRE_EXTRA_MATCH_LIMIT 0
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
#define PCRE_ANCHORED 0
#define PCRE_NOTEMPTY 0
#define PCRE_ERROR_NOMATCH 1
#define PCRE_ERROR_MATCHLIMIT 2
#define PCRE_ERROR_RECURSIONLIMIT 3
#define PCRE_INFO_CAPTURECOUNT 0
#define pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); })
#define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; })
#define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; })
} // namespace re2
#endif
namespace re2 {
class PCRE_Options;
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "PCRE" object is safe for
// concurrent use by multiple threads.
class PCRE {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
// Marks end of arg list.
// ONLY USE IN OPTIONAL ARG DEFAULTS.
// DO NOT PASS EXPLICITLY.
static Arg no_more_args;
// Options are same value as those in pcre. We provide them here
// to avoid users needing to include pcre.h and also to isolate
// users from pcre should we change the underlying library.
// Only those needed by Google programs are exposed here to
// avoid collision with options employed internally by regexp.cc
// Note that some options have equivalents that can be specified in
// the regexp itself. For example, prefixing your regexp with
// "(?s)" has the same effect as the PCRE_DOTALL option.
enum Option {
None = 0x0000,
UTF8 = 0x0800, // == PCRE_UTF8
EnabledCompileOptions = UTF8,
EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag
};
// We provide implicit conversions from strings so that users can
// pass in a string or a "const char*" wherever an "PCRE" is expected.
PCRE(const char* pattern);
PCRE(const char* pattern, Option option);
PCRE(const string& pattern);
PCRE(const string& pattern, Option option);
PCRE(const char *pattern, const PCRE_Options& re_option);
PCRE(const string& pattern, const PCRE_Options& re_option);
~PCRE();
// The string specification for this PCRE. E.g.
// PCRE re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const string& pattern() const { return pattern_; }
// If PCRE could not be created properly, returns an error string.
// Else returns the empty string.
const string& error() const { return *error_; }
// Whether the PCRE has hit a match limit during execution.
// Not thread safe. Intended only for testing.
// If hitting match limits is a problem,
// you should be using PCRE2 (re2/re2.h)
// instead of checking this flag.
bool HitLimit();
void ClearHitLimit();
/***** The useful part: the matching interface *****/
// Matches "text" against "pattern". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "PCRE" for "pattern".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, int)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
struct FullMatchFunctor {
bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const FullMatchFunctor FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// a substring of "text".
struct PartialMatchFunctor {
bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const PartialMatchFunctor PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
struct ConsumeFunctor {
bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const ConsumeFunctor Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
struct FindAndConsumeFunctor {
bool operator ()(StringPiece* input, const PCRE& pattern,
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const FindAndConsumeFunctor FindAndConsume;
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(PCRE::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(string *str,
const PCRE& pattern,
const StringPiece& rewrite);
// Like Replace(), except replaces all occurrences of the pattern in
// the string with the rewrite. Replacements are not subject to
// re-matching. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(PCRE::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
//
// Returns the number of replacements made.
static int GlobalReplace(string *str,
const PCRE& pattern,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
static bool Extract(const StringPiece &text,
const PCRE& pattern,
const StringPiece &rewrite,
string *out);
// Check that the given @p rewrite string is suitable for use with
// this PCRE. It checks that:
// * The PCRE has enough parenthesized subexpressions to satisfy all
// of the \N tokens in @p rewrite, and
// * The @p rewrite string doesn't have any syntax errors
// ('\' followed by anything besides [0-9] and '\').
// Making this test will guarantee that "replace" and "extract"
// operations won't LOG(ERROR) or fail because of a bad rewrite
// string.
// @param rewrite The proposed rewrite string.
// @param error An error message is recorded here, iff we return false.
// Otherwise, it is unchanged.
// @return true, iff @p rewrite is suitable for use with the PCRE.
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
// Returns a copy of 'unquoted' with all potentially meaningful
// regexp characters backslash-escaped. The returned string, used
// as a regular expression, will exactly match the original string.
// For example,
// 1.5-2.0?
// becomes:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
/***** Generic matching interface (not so nice to use) *****/
// Type of match (TODO: Should be restructured as an Option)
enum Anchor {
UNANCHORED, // No anchoring
ANCHOR_START, // Anchor at start only
ANCHOR_BOTH, // Anchor at start and end
};
// General matching routine. Stores the length of the match in
// "*consumed" if successful.
bool DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
const Arg* const* args, int n) const;
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction.
int NumberOfCapturingGroups() const;
private:
void Init(const char* pattern, Option option, int match_limit,
int stack_limit, bool report_errors);
// Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
// pairs of integers for the beginning and end positions of matched
// text. The first pair corresponds to the entire matched text;
// subsequent pairs correspond, in order, to parentheses-captured
// matches. Returns the number of pairs (one more than the number of
// the last subpattern with a match) if matching was successful
// and zero if the match failed.
// I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching
// against "foo", "bar", and "baz" respectively.
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
// But the values for all subpattern are filled in into "vec".
int TryMatch(const StringPiece& text,
int startpos,
Anchor anchor,
bool empty_ok,
int *vec,
int vecsize) const;
// Append the "rewrite" string, with backslash subsitutions from "text"
// and "vec", to string "out".
bool Rewrite(string *out,
const StringPiece &rewrite,
const StringPiece &text,
int *vec,
int veclen) const;
// internal implementation for DoMatch
bool DoMatchImpl(const StringPiece& text,
Anchor anchor,
int* consumed,
const Arg* const args[],
int n,
int* vec,
int vecsize) const;
// Compile the regexp for the specified anchoring mode
pcre* Compile(Anchor anchor);
string pattern_;
Option options_;
pcre* re_full_; // For full matches
pcre* re_partial_; // For partial matches
const string* error_; // Error indicator (or empty string)
bool report_errors_; // Silences error logging if false
int match_limit_; // Limit on execution resources
int stack_limit_; // Limit on stack resources (bytes)
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
DISALLOW_COPY_AND_ASSIGN(PCRE);
};
// PCRE_Options allow you to set the PCRE::Options, plus any pcre
// "extra" options. The only extras are match_limit, which limits
// the CPU time of a match, and stack_limit, which limits the
// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default
// that should not cause too many problems in production code.
// If PCRE hits a limit during a match, it may return a false negative,
// but (hopefully) it won't crash.
//
// NOTE: If you are handling regular expressions specified by
// (external or internal) users, rather than hard-coded ones,
// you should be using PCRE2, which uses an alternate implementation
// that avoids these issues. See http://go/re2quick.
class PCRE_Options {
public:
// constructor
PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {}
// accessors
PCRE::Option option() const { return option_; }
void set_option(PCRE::Option option) {
option_ = option;
}
int match_limit() const { return match_limit_; }
void set_match_limit(int match_limit) {
match_limit_ = match_limit;
}
int stack_limit() const { return stack_limit_; }
void set_stack_limit(int stack_limit) {
stack_limit_ = stack_limit;
}
// If the regular expression is malformed, an error message will be printed
// iff report_errors() is true. Default: true.
bool report_errors() const { return report_errors_; }
void set_report_errors(bool report_errors) {
report_errors_ = report_errors;
}
private:
PCRE::Option option_;
int match_limit_;
int stack_limit_;
bool report_errors_;
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Special class for parsing into objects that define a ParseFrom() method
template <class T>
class _PCRE_MatchObject {
public:
static inline bool Parse(const char* str, int n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
}
};
class PCRE::Arg {
public:
// Empty constructor so we can declare arrays of PCRE::Arg
Arg();
// Constructor specially designed for NULL arguments
Arg(void*);
typedef bool (*Parser)(const char* str, int n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type,name) \
Arg(type* p) : arg_(p), parser_(name) { } \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
MAKE_PARSER(char, parse_char);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
MAKE_PARSER(unsigned int, parse_uint);
MAKE_PARSER(long, parse_long);
MAKE_PARSER(unsigned long, parse_ulong);
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
#undef MAKE_PARSER
// Generic constructor
template <class T> Arg(T*, Parser parser);
// Generic constructor template
template <class T> Arg(T* p)
: arg_(p), parser_(_PCRE_MatchObject<T>::Parse) {
}
// Parse the data
bool Parse(const char* str, int n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, int n, void* dest);
static bool parse_char (const char* str, int n, void* dest);
static bool parse_uchar (const char* str, int n, void* dest);
static bool parse_float (const char* str, int n, void* dest);
static bool parse_double (const char* str, int n, void* dest);
static bool parse_string (const char* str, int n, void* dest);
static bool parse_stringpiece (const char* str, int n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_ ## name(const char* str, int n, void* dest); \
static bool parse_ ## name ## _radix( \
const char* str, int n, void* dest, int radix); \
public: \
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
DECLARE_INTEGER_PARSER(int);
DECLARE_INTEGER_PARSER(uint);
DECLARE_INTEGER_PARSER(long);
DECLARE_INTEGER_PARSER(ulong);
DECLARE_INTEGER_PARSER(longlong);
DECLARE_INTEGER_PARSER(ulonglong);
#undef DECLARE_INTEGER_PARSER
};
inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
inline bool PCRE::Arg::Parse(const char* str, int n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline PCRE::Arg Hex(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \
inline PCRE::Arg Octal(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \
inline PCRE::Arg CRadix(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); }
MAKE_INTEGER_PARSER(short, short);
MAKE_INTEGER_PARSER(unsigned short, ushort);
MAKE_INTEGER_PARSER(int, int);
MAKE_INTEGER_PARSER(unsigned int, uint);
MAKE_INTEGER_PARSER(long, long);
MAKE_INTEGER_PARSER(unsigned long, ulong);
MAKE_INTEGER_PARSER(long long, longlong);
MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
#undef MAKE_INTEGER_PARSER
} // namespace re2

View File

@@ -0,0 +1,29 @@
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Modified from Google perftools's tcmalloc_unittest.cc.
#ifndef RE2_UTIL_RANDOM_H__
#define RE2_UTIL_RANDOM_H__
#include "util/util.h"
namespace re2 {
// ACM minimal standard random number generator. (re-entrant.)
class ACMRandom {
public:
ACMRandom(int32 seed) : seed_(seed) {}
int32 Next();
int32 Uniform(int32);
void Reset(int32 seed) { seed_ = seed; }
private:
int32 seed_;
};
} // namespace re2
#endif // RE2_UTIL_RANDOM_H__

View File

@@ -0,0 +1,258 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "re2/util/utf.h"
namespace re2 {
enum
{
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror,
};
int
chartorune(Rune *rune, const char *str)
{
int c, c1, c2, c3;
long l;
/*
* one character sequence
* 00000-0007F => T1
*/
c = *(unsigned char*)str;
if(c < Tx) {
*rune = c;
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
c1 = *(unsigned char*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(unsigned char*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(unsigned char*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
*rune = l;
return 4;
}
/*
* Support for 5-byte or longer UTF-8 would go here, but
* since we don't have that, we'll just fall through to bad.
*/
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
}
int
runetochar(char *str, const Rune *rune)
{
/* Runes are signed, so convert to unsigned for range check. */
unsigned long c;
/*
* one character sequence
* 00000-0007F => 00-7F
*/
c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
if(c <= Rune2) {
str[0] = T2 | (c >> 1*Bitx);
str[1] = Tx | (c & Maskx);
return 2;
}
/*
* If the Rune is out of range, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
}
int
runelen(Rune rune)
{
char str[10];
return runetochar(str, &rune);
}
int
fullrune(const char *str, int n)
{
if (n > 0) {
int c = *(unsigned char*)str;
if (c < Tx)
return 1;
if (n > 1) {
if (c < T3)
return 1;
if (n > 2) {
if (c < T4 || n > 3)
return 1;
}
}
}
return 0;
}
int
utflen(const char *s)
{
int c;
long n;
Rune rune;
n = 0;
for(;;) {
c = *(unsigned char*)s;
if(c < Runeself) {
if(c == 0)
return n;
s++;
} else
s += chartorune(&rune, s);
n++;
}
return 0;
}
char*
utfrune(const char *s, Rune c)
{
long c1;
Rune r;
int n;
if(c < Runesync) /* not part of utf sequence */
return strchr((char*)s, c);
for(;;) {
c1 = *(unsigned char*)s;
if(c1 < Runeself) { /* one byte rune */
if(c1 == 0)
return 0;
if(c1 == c)
return (char*)s;
s++;
continue;
}
n = chartorune(&r, s);
if(r == c)
return (char*)s;
s += n;
}
return 0;
}
} // namespace re2

View File

@@ -0,0 +1,453 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// DESCRIPTION
//
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
// fast iteration through the elements in the array and fast clearing
// of the array. The array has a concept of certain elements being
// uninitialized (having no value).
//
// Insertion and deletion are constant time operations.
//
// Allocating the array is a constant time operation
// when memory allocation is a constant time operation.
//
// Clearing the array is a constant time operation (unusual!).
//
// Iterating through the array is an O(n) operation, where n
// is the number of items in the array (not O(m)).
//
// The array iterator visits entries in the order they were first
// inserted into the array. It is safe to add items to the array while
// using an iterator: the iterator will visit indices added to the array
// during the iteration, but will not re-visit indices whose values
// change after visiting. Thus SparseArray can be a convenient
// implementation of a work queue.
//
// The SparseArray implementation is NOT thread-safe. It is up to the
// caller to make sure only one thread is accessing the array. (Typically
// these arrays are temporary values and used in situations where speed is
// important.)
//
// The SparseArray interface does not present all the usual STL bells and
// whistles.
//
// Implemented with reference to Briggs & Torczon, An Efficient
// Representation for Sparse Sets, ACM Letters on Programming Languages
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// Briggs & Torczon popularized this technique, but it had been known
// long before their paper. They point out that Aho, Hopcroft, and
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
// 1986 Programming Pearls both hint at the technique in exercises to the
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
// exercise 8).
//
// Briggs & Torczon describe a sparse set implementation. I have
// trivially generalized it to create a sparse array (actually the original
// target of the AHU and Bentley exercises).
// IMPLEMENTATION
//
// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
// size max_size_. At any point, the number of elements in the sparse array is
// size_.
//
// The vector dense_ contains the size_ elements in the sparse array (with
// their indices),
// in the order that the elements were first inserted. This array is dense:
// the size_ pairs are dense_[0] through dense_[size_-1].
//
// The array sparse_to_dense_ maps from indices in [0,m) to indices in
// [0,size_).
// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
// For indices not present in the array, sparse_to_dense_ can contain
// any value at all, perhaps outside the range [0, size_) but perhaps not.
//
// The lax requirement on sparse_to_dense_ values makes clearing
// the array very easy: set size_ to 0. Lookups are slightly more
// complicated. An index i has a value in the array if and only if:
// sparse_to_dense_[i] is in [0, size_) AND
// dense_[sparse_to_dense_[i]].index_ == i.
// If both these properties hold, only then it is safe to refer to
// dense_[sparse_to_dense_[i]].value_
// as the value associated with index i.
//
// To insert a new entry, set sparse_to_dense_[i] to size_,
// initialize dense_[size_], and then increment size_.
//
// Deletion of specific values from the array is implemented by
// swapping dense_[size_-1] and the dense_ being deleted and then
// updating the appropriate sparse_to_dense_ entries.
//
// To make the sparse array as efficient as possible for non-primitive types,
// elements may or may not be destroyed when they are deleted from the sparse
// array through a call to erase(), erase_existing() or resize(). They
// immediately become inaccessible, but they are only guaranteed to be
// destroyed when the SparseArray destructor is called.
#ifndef RE2_UTIL_SPARSE_ARRAY_H__
#define RE2_UTIL_SPARSE_ARRAY_H__
#include "re2/util/util.h"
namespace re2 {
template<typename Value>
class SparseArray {
public:
SparseArray();
SparseArray(int max_size);
~SparseArray();
// IndexValue pairs: exposed in SparseArray::iterator.
class IndexValue;
typedef IndexValue value_type;
typedef typename vector<IndexValue>::iterator iterator;
typedef typename vector<IndexValue>::const_iterator const_iterator;
inline const IndexValue& iv(int i) const;
// Return the number of entries in the array.
int size() const {
return size_;
}
// Iterate over the array.
iterator begin() {
return dense_.begin();
}
iterator end() {
return dense_.begin() + size_;
}
const_iterator begin() const {
return dense_.begin();
}
const_iterator end() const {
return dense_.begin() + size_;
}
// Change the maximum size of the array.
// Invalidates all iterators.
void resize(int max_size);
// Return the maximum size of the array.
// Indices can be in the range [0, max_size).
int max_size() const {
return max_size_;
}
// Clear the array.
void clear() {
size_ = 0;
}
// Check whether index i is in the array.
inline bool has_index(int i) const;
// Comparison function for sorting.
// Can sort the sparse array so that future iterations
// will visit indices in increasing order using
// sort(arr.begin(), arr.end(), arr.less);
static bool less(const IndexValue& a, const IndexValue& b);
public:
// Set the value at index i to v.
inline iterator set(int i, Value v);
pair<iterator, bool> insert(const value_type& new_value);
// Returns the value at index i
// or defaultv if index i is not initialized in the array.
inline Value get(int i, Value defaultv) const;
iterator find(int i);
const_iterator find(int i) const;
// Change the value at index i to v.
// Fast but unsafe: only use if has_index(i) is true.
inline iterator set_existing(int i, Value v);
// Set the value at the new index i to v.
// Fast but unsafe: only use if has_index(i) is false.
inline iterator set_new(int i, Value v);
// Get the value at index i from the array..
// Fast but unsafe: only use if has_index(i) is true.
inline Value get_existing(int i) const;
// Erasing items from the array during iteration is in general
// NOT safe. There is one special case, which is that the current
// index-value pair can be erased as long as the iterator is then
// checked for being at the end before being incremented.
// For example:
//
// for (i = m.begin(); i != m.end(); ++i) {
// if (ShouldErase(i->index(), i->value())) {
// m.erase(i->index());
// --i;
// }
// }
//
// Except in the specific case just described, elements must
// not be erased from the array (including clearing the array)
// while iterators are walking over the array. Otherwise,
// the iterators could walk past the end of the array.
// Erases the element at index i from the array.
inline void erase(int i);
// Erases the element at index i from the array.
// Fast but unsafe: only use if has_index(i) is true.
inline void erase_existing(int i);
private:
// Add the index i to the array.
// Only use if has_index(i) is known to be false.
// Since it doesn't set the value associated with i,
// this function is private, only intended as a helper
// for other methods.
inline void create_index(int i);
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
inline void DebugCheckInvariants() const;
int size_;
int max_size_;
int* sparse_to_dense_;
vector<IndexValue> dense_;
bool valgrind_;
DISALLOW_COPY_AND_ASSIGN(SparseArray);
};
template<typename Value>
SparseArray<Value>::SparseArray()
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {}
// IndexValue pairs: exposed in SparseArray::iterator.
template<typename Value>
class SparseArray<Value>::IndexValue {
friend class SparseArray;
public:
typedef int first_type;
typedef Value second_type;
IndexValue() {}
IndexValue(int index, const Value& value) : second(value), index_(index) {}
int index() const { return index_; }
Value value() const { return second; }
// Provide the data in the 'second' member so that the utilities
// in map-util work.
Value second;
private:
int index_;
};
template<typename Value>
const typename SparseArray<Value>::IndexValue&
SparseArray<Value>::iv(int i) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, size_);
return dense_[i];
}
// Change the maximum size of the array.
// Invalidates all iterators.
template<typename Value>
void SparseArray<Value>::resize(int new_max_size) {
DebugCheckInvariants();
if (new_max_size > max_size_) {
int* a = new int[new_max_size];
if (sparse_to_dense_) {
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
// Don't need to zero the memory but appease Valgrind.
if (valgrind_) {
for (int i = max_size_; i < new_max_size; i++)
a[i] = 0xababababU;
}
delete[] sparse_to_dense_;
}
sparse_to_dense_ = a;
dense_.resize(new_max_size);
}
max_size_ = new_max_size;
if (size_ > max_size_)
size_ = max_size_;
DebugCheckInvariants();
}
// Check whether index i is in the array.
template<typename Value>
bool SparseArray<Value>::has_index(int i) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, max_size_);
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
return false;
}
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
return (uint)sparse_to_dense_[i] < (uint)size_ &&
dense_[sparse_to_dense_[i]].index_ == i;
}
// Set the value at index i to v.
template<typename Value>
typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
DebugCheckInvariants();
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
if (!has_index(i))
create_index(i);
return set_existing(i, v);
}
template<typename Value>
pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
const value_type& new_value) {
DebugCheckInvariants();
pair<typename SparseArray<Value>::iterator, bool> p;
if (has_index(new_value.index_)) {
p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
} else {
p = make_pair(set_new(new_value.index_, new_value.second), true);
}
DebugCheckInvariants();
return p;
}
template<typename Value>
Value SparseArray<Value>::get(int i, Value defaultv) const {
if (!has_index(i))
return defaultv;
return get_existing(i);
}
template<typename Value>
typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
if (has_index(i))
return dense_.begin() + sparse_to_dense_[i];
return end();
}
template<typename Value>
typename SparseArray<Value>::const_iterator
SparseArray<Value>::find(int i) const {
if (has_index(i)) {
return dense_.begin() + sparse_to_dense_[i];
}
return end();
}
template<typename Value>
typename SparseArray<Value>::iterator
SparseArray<Value>::set_existing(int i, Value v) {
DebugCheckInvariants();
DCHECK(has_index(i));
dense_[sparse_to_dense_[i]].second = v;
DebugCheckInvariants();
return dense_.begin() + sparse_to_dense_[i];
}
template<typename Value>
typename SparseArray<Value>::iterator
SparseArray<Value>::set_new(int i, Value v) {
DebugCheckInvariants();
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
DCHECK(!has_index(i));
create_index(i);
return set_existing(i, v);
}
template<typename Value>
Value SparseArray<Value>::get_existing(int i) const {
DCHECK(has_index(i));
return dense_[sparse_to_dense_[i]].second;
}
template<typename Value>
void SparseArray<Value>::erase(int i) {
DebugCheckInvariants();
if (has_index(i))
erase_existing(i);
DebugCheckInvariants();
}
template<typename Value>
void SparseArray<Value>::erase_existing(int i) {
DebugCheckInvariants();
DCHECK(has_index(i));
int di = sparse_to_dense_[i];
if (di < size_ - 1) {
dense_[di] = dense_[size_ - 1];
sparse_to_dense_[dense_[di].index_] = di;
}
size_--;
DebugCheckInvariants();
}
template<typename Value>
void SparseArray<Value>::create_index(int i) {
DCHECK(!has_index(i));
DCHECK_LT(size_, max_size_);
sparse_to_dense_[i] = size_;
dense_[size_].index_ = i;
size_++;
}
template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
max_size_ = max_size;
sparse_to_dense_ = new int[max_size];
valgrind_ = RunningOnValgrind();
dense_.resize(max_size);
// Don't need to zero the new memory, but appease Valgrind.
if (valgrind_) {
for (int i = 0; i < max_size; i++) {
sparse_to_dense_[i] = 0xababababU;
dense_[i].index_ = 0xababababU;
}
}
size_ = 0;
DebugCheckInvariants();
}
template<typename Value> SparseArray<Value>::~SparseArray() {
DebugCheckInvariants();
delete[] sparse_to_dense_;
}
template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
DCHECK_LE(0, size_);
DCHECK_LE(size_, max_size_);
DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
}
// Comparison function for sorting.
template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
const IndexValue& b) {
return a.index_ < b.index_;
}
} // namespace re2
#endif // RE2_UTIL_SPARSE_ARRAY_H__

View File

@@ -0,0 +1,188 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// DESCRIPTION
//
// SparseSet<T>(m) is a set of integers in [0, m).
// It requires sizeof(int)*m memory, but it provides
// fast iteration through the elements in the set and fast clearing
// of the set.
//
// Insertion and deletion are constant time operations.
//
// Allocating the set is a constant time operation
// when memory allocation is a constant time operation.
//
// Clearing the set is a constant time operation (unusual!).
//
// Iterating through the set is an O(n) operation, where n
// is the number of items in the set (not O(m)).
//
// The set iterator visits entries in the order they were first
// inserted into the array. It is safe to add items to the set while
// using an iterator: the iterator will visit indices added to the set
// during the iteration, but will not re-visit indices whose values
// change after visiting. Thus SparseSet can be a convenient
// implementation of a work queue.
//
// The SparseSet implementation is NOT thread-safe. It is up to the
// caller to make sure only one thread is accessing the set. (Typically
// these sets are temporary values and used in situations where speed is
// important.)
//
// The SparseSet interface does not present all the usual STL bells and
// whistles.
//
// Implemented with reference to Briggs & Torczon, An Efficient
// Representation for Sparse Sets, ACM Letters on Programming Languages
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// For a generalization to sparse array, see sparse_array.h.
// IMPLEMENTATION
//
// See sparse_array.h for implementation details
#ifndef RE2_UTIL_SPARSE_SET_H__
#define RE2_UTIL_SPARSE_SET_H__
#include "re2/util/util.h"
namespace re2 {
static bool InitMemory() {
#ifdef MEMORY_SANITIZER
return true;
#else
return RunningOnValgrind();
#endif
}
class SparseSet {
public:
SparseSet()
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL),
init_memory_(InitMemory()) {}
SparseSet(int max_size) {
max_size_ = max_size;
sparse_to_dense_ = new int[max_size];
dense_ = new int[max_size];
init_memory_ = InitMemory();
// Don't need to zero the memory, but do so anyway
// to appease Valgrind.
if (init_memory_) {
for (int i = 0; i < max_size; i++) {
dense_[i] = 0xababababU;
sparse_to_dense_[i] = 0xababababU;
}
}
size_ = 0;
}
~SparseSet() {
delete[] sparse_to_dense_;
delete[] dense_;
}
typedef int* iterator;
typedef const int* const_iterator;
int size() const { return size_; }
iterator begin() { return dense_; }
iterator end() { return dense_ + size_; }
const_iterator begin() const { return dense_; }
const_iterator end() const { return dense_ + size_; }
// Change the maximum size of the array.
// Invalidates all iterators.
void resize(int new_max_size) {
if (size_ > new_max_size)
size_ = new_max_size;
if (new_max_size > max_size_) {
int* a = new int[new_max_size];
if (sparse_to_dense_) {
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
if (init_memory_) {
for (int i = max_size_; i < new_max_size; i++)
a[i] = 0xababababU;
}
delete[] sparse_to_dense_;
}
sparse_to_dense_ = a;
a = new int[new_max_size];
if (dense_) {
memmove(a, dense_, size_*sizeof a[0]);
if (init_memory_) {
for (int i = size_; i < new_max_size; i++)
a[i] = 0xababababU;
}
delete[] dense_;
}
dense_ = a;
}
max_size_ = new_max_size;
}
// Return the maximum size of the array.
// Indices can be in the range [0, max_size).
int max_size() const { return max_size_; }
// Clear the array.
void clear() { size_ = 0; }
// Check whether i is in the array.
bool contains(int i) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, max_size_);
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
return false;
}
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
return (uint)sparse_to_dense_[i] < (uint)size_ &&
dense_[sparse_to_dense_[i]] == i;
}
// Adds i to the set.
void insert(int i) {
if (!contains(i))
insert_new(i);
}
// Set the value at the new index i to v.
// Fast but unsafe: only use if contains(i) is false.
void insert_new(int i) {
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return;
}
DCHECK(!contains(i));
DCHECK_LT(size_, max_size_);
sparse_to_dense_[i] = size_;
dense_[size_] = i;
size_++;
}
// Comparison function for sorting.
// Can sort the sparse array so that future iterations
// will visit indices in increasing order using
// sort(arr.begin(), arr.end(), arr.less);
static bool less(int a, int b) { return a < b; }
private:
int size_;
int max_size_;
int* sparse_to_dense_;
int* dense_;
bool init_memory_;
DISALLOW_COPY_AND_ASSIGN(SparseSet);
};
} // namespace re2
#endif // RE2_UTIL_SPARSE_SET_H__

View File

@@ -0,0 +1,82 @@
// Copyright 2002 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/util/util.h"
namespace re2 {
static void StringAppendV(string* dst, const char* format, va_list ap) {
// First try with a small fixed size buffer
char space[1024];
// It's possible for methods that use a va_list to invalidate
// the data in it upon use. The fix is to make a copy
// of the structure before using it and use that copy instead.
va_list backup_ap;
va_copy(backup_ap, ap);
int result = vsnprintf(space, sizeof(space), format, backup_ap);
va_end(backup_ap);
if ((result >= 0) && (static_cast<unsigned long>(result) < sizeof(space))) {
// It fit
dst->append(space, result);
return;
}
// Repeatedly increase buffer size until it fits
int length = sizeof(space);
while (true) {
if (result < 0) {
// Older behavior: just try doubling the buffer size
length *= 2;
} else {
// We need exactly "result+1" characters
length = result+1;
}
char* buf = new char[length];
// Restore the va_list before we use it again
va_copy(backup_ap, ap);
#ifdef WIN32
result = vsnprintf_s(buf, length, length, format, backup_ap);
#else
result = vsnprintf(buf, length, format, backup_ap);
#endif
va_end(backup_ap);
if ((result >= 0) && (result < length)) {
// It fit
dst->append(buf, result);
delete[] buf;
return;
}
delete[] buf;
}
}
string StringPrintf(const char* format, ...) {
va_list ap;
va_start(ap, format);
string result;
StringAppendV(&result, format, ap);
va_end(ap);
return result;
}
void SStringPrintf(string* dst, const char* format, ...) {
va_list ap;
va_start(ap, format);
dst->clear();
StringAppendV(dst, format, ap);
va_end(ap);
}
void StringAppendF(string* dst, const char* format, ...) {
va_list ap;
va_start(ap, format);
StringAppendV(dst, format, ap);
va_end(ap);
}
} // namespace re2

View File

@@ -0,0 +1,102 @@
// Copyright 1999-2005 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// ----------------------------------------------------------------------
// CEscapeString()
// Copies 'src' to 'dest', escaping dangerous characters using
// C-style escape sequences. 'src' and 'dest' should not overlap.
// Returns the number of bytes written to 'dest' (not including the \0)
// or -1 if there was insufficient space.
// ----------------------------------------------------------------------
int CEscapeString(const char* src, int src_len, char* dest,
int dest_len) {
const char* src_end = src + src_len;
int used = 0;
for (; src < src_end; src++) {
if (dest_len - used < 2) // space for two-character escape
return -1;
unsigned char c = *src;
switch (c) {
case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
default:
// Note that if we emit \xNN and the src character after that is a hex
// digit then that digit must be escaped too to prevent it being
// interpreted as part of the character code by C.
if (c < ' ' || c > '~') {
if (dest_len - used < 5) // space for four-character escape + \0
return -1;
#ifdef WIN32
sprintf_s(dest + used, dest_len, "\\%03o", c);
#else
sprintf(dest + used, "\\%03o", c);
#endif
used += 4;
} else {
dest[used++] = c; break;
}
}
}
if (dest_len - used < 1) // make sure that there is room for \0
return -1;
dest[used] = '\0'; // doesn't count towards return value though
return used;
}
// ----------------------------------------------------------------------
// CEscape()
// Copies 'src' to result, escaping dangerous characters using
// C-style escape sequences. 'src' and 'dest' should not overlap.
// ----------------------------------------------------------------------
string CEscape(const StringPiece& src) {
const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
char* dest = new char[dest_length];
const int len = CEscapeString(src.data(), src.size(),
dest, dest_length);
string s = string(dest, len);
delete[] dest;
return s;
}
string PrefixSuccessor(const StringPiece& prefix) {
// We can increment the last character in the string and be done
// unless that character is 255, in which case we have to erase the
// last character and increment the previous character, unless that
// is 255, etc. If the string is empty or consists entirely of
// 255's, we just return the empty string.
bool done = false;
string limit(prefix.data(), prefix.size());
int index = limit.length() - 1;
while (!done && index >= 0) {
if ((limit[index]&255) == 255) {
limit.erase(index);
index--;
} else {
limit[index]++;
done = true;
}
}
if (!done) {
return "";
} else {
return limit;
}
}
} // namespace re2

View File

@@ -0,0 +1,50 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_TEST_H__
#define RE2_UTIL_TEST_H__
#include "util/util.h"
#include "util/flags.h"
#define TEST(x, y) \
void x##y(void); \
TestRegisterer r##x##y(x##y, # x "." # y); \
void x##y(void)
void RegisterTest(void (*)(void), const char*);
class TestRegisterer {
public:
TestRegisterer(void (*fn)(void), const char *s) {
RegisterTest(fn, s);
}
};
// TODO(rsc): Do a better job.
#define EXPECT_EQ CHECK_EQ
#define EXPECT_TRUE CHECK
#define EXPECT_LT CHECK_LT
#define EXPECT_GT CHECK_GT
#define EXPECT_LE CHECK_LE
#define EXPECT_GE CHECK_GE
#define EXPECT_FALSE(x) CHECK(!(x))
const bool UsingMallocCounter = false;
namespace testing {
class MallocCounter {
public:
MallocCounter(int x) { }
static const int THIS_THREAD_ONLY = 0;
long long HeapGrowth() { return 0; }
long long PeakHeapGrowth() { return 0; }
void Reset() { }
};
} // namespace testing
namespace re2 {
int64 VirtualProcessSize();
} // namespace re2
#endif // RE2_UTIL_TEST_H__

View File

@@ -0,0 +1,26 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_THREAD_H__
#define RE2_UTIL_THREAD_H__
#include <pthread.h>
class Thread {
public:
Thread();
virtual ~Thread();
void Start();
void Join();
void SetJoinable(bool);
virtual void Run() = 0;
private:
pthread_t pid_;
bool running_;
bool joinable_;
};
#endif // RE2_UTIL_THREAD_H__

View File

@@ -0,0 +1,43 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*
* This file and rune.cc have been converted to compile as C++ code
* in name space re2.
*/
#ifndef RE2_UTIL_UTF_H__
#define RE2_UTIL_UTF_H__
#include <stdint.h>
namespace re2 {
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
enum
{
UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0x10FFFF, /* maximum rune value */
};
int runetochar(char* s, const Rune* r);
int chartorune(Rune* r, const char* s);
int fullrune(const char* s, int n);
int utflen(const char* s);
char* utfrune(const char*, Rune);
} // namespace re2
#endif // RE2_UTIL_UTF_H__

View File

@@ -0,0 +1,154 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_UTIL_H__
#define RE2_UTIL_UTIL_H__
// C
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h> // For size_t
#include <assert.h>
#include <stdarg.h>
#include <time.h> // For clock_gettime, CLOCK_REALTIME
#include <ctype.h> // For isdigit, isalpha
#if !defined(_WIN32)
#include <sys/time.h> // For gettimeofday
#endif
// C++
#include <ctime>
#include <vector>
#include <string>
#include <algorithm>
#include <iosfwd>
#include <map>
#include <stack>
#include <ostream>
#include <utility>
#include <set>
// Use std names.
using std::set;
using std::pair;
using std::vector;
using std::string;
using std::min;
using std::max;
using std::ostream;
using std::map;
using std::stack;
using std::sort;
using std::swap;
using std::make_pair;
#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION)
#include <tr1/unordered_set>
using std::tr1::unordered_set;
#else
#include <unordered_set>
#if defined(_WIN32)
using std::tr1::unordered_set;
#else
using std::unordered_set;
#endif
#endif
#ifdef _WIN32
#define snprintf _snprintf_s
#define sprintf sprintf_s
#define stricmp _stricmp
#define strtof strtod /* not really correct but best we can do */
#define strtoll _strtoi64
#define strtoull _strtoui64
#define vsnprintf vsnprintf_s
#pragma warning(disable: 4018) // signed/unsigned mismatch
#pragma warning(disable: 4244) // possible data loss in int conversion
#pragma warning(disable: 4800) // conversion from int to bool
#endif
namespace re2 {
typedef int8_t int8;
typedef uint8_t uint8;
typedef int16_t int16;
typedef uint16_t uint16;
typedef int32_t int32;
typedef uint32_t uint32;
typedef int64_t int64;
typedef uint64_t uint64;
typedef unsigned long ulong;
typedef unsigned int uint;
typedef unsigned short ushort;
// Prevent the compiler from complaining about or optimizing away variables
// that appear unused.
#undef ATTRIBUTE_UNUSED
#if defined(__GNUC__)
#define ATTRIBUTE_UNUSED __attribute__ ((unused))
#else
#define ATTRIBUTE_UNUSED
#endif
// COMPILE_ASSERT causes a compile error about msg if expr is not true.
#if __cplusplus >= 201103L
#define COMPILE_ASSERT(expr, msg) static_assert(expr, #msg)
#else
template<bool> struct CompileAssert {};
#define COMPILE_ASSERT(expr, msg) \
typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] ATTRIBUTE_UNUSED
#endif
// DISALLOW_COPY_AND_ASSIGN disallows the copy and operator= functions.
// It goes in the private: declarations in a class.
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&)
#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0]))
class StringPiece;
string CEscape(const StringPiece& src);
int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
extern string StringPrintf(const char* format, ...);
extern void SStringPrintf(string* dst, const char* format, ...);
extern void StringAppendF(string* dst, const char* format, ...);
extern string PrefixSuccessor(const StringPiece& prefix);
uint32 hashword(const uint32*, size_t, uint32);
void hashword2(const uint32*, size_t, uint32*, uint32*);
static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {
return hashword((uint32*)s, len/4, seed);
}
static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {
uint32 x, y;
x = seed;
y = 0;
hashword2((uint32*)s, len/4, &x, &y);
return ((uint64)x << 32) | y;
}
int RunningOnValgrind();
} // namespace re2
#include "re2/util/logging.h"
#include "re2/util/mutex.h"
#include "re2/util/utf.h"
#endif // RE2_UTIL_UTIL_H__

View File

@@ -0,0 +1,26 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/util/util.h"
#ifndef _WIN32
#include "re2/util/valgrind.h"
#endif
namespace re2 {
#ifndef __has_feature
#define __has_feature(x) 0
#endif
int RunningOnValgrind() {
#if __has_feature(memory_sanitizer)
return true;
#elif defined(RUNNING_ON_VALGRIND)
return RUNNING_ON_VALGRIND;
#else
return 0;
#endif
}
} // namespace re2

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,344 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_VARIADIC_FUNCTION_H_
#define RE2_VARIADIC_FUNCTION_H_
namespace re2 {
template <typename Result, typename Param0, typename Param1, typename Arg,
Result (*Func)(Param0, Param1, const Arg* const [], int count)>
class VariadicFunction2 {
public:
Result operator()(Param0 p0, Param1 p1) const {
return Func(p0, p1, 0, 0);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
const Arg* const args[] = { &a0 };
return Func(p0, p1, args, 1);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
const Arg* const args[] = { &a0, &a1 };
return Func(p0, p1, args, 2);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2) const {
const Arg* const args[] = { &a0, &a1, &a2 };
return Func(p0, p1, args, 3);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3 };
return Func(p0, p1, args, 4);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
return Func(p0, p1, args, 5);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
return Func(p0, p1, args, 6);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
return Func(p0, p1, args, 7);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
return Func(p0, p1, args, 8);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
return Func(p0, p1, args, 9);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9 };
return Func(p0, p1, args, 10);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10 };
return Func(p0, p1, args, 11);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11 };
return Func(p0, p1, args, 12);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12 };
return Func(p0, p1, args, 13);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13 };
return Func(p0, p1, args, 14);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14 };
return Func(p0, p1, args, 15);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15 };
return Func(p0, p1, args, 16);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
return Func(p0, p1, args, 17);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
return Func(p0, p1, args, 18);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
return Func(p0, p1, args, 19);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
return Func(p0, p1, args, 20);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
&a20 };
return Func(p0, p1, args, 21);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21 };
return Func(p0, p1, args, 22);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22 };
return Func(p0, p1, args, 23);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23 };
return Func(p0, p1, args, 24);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24 };
return Func(p0, p1, args, 25);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25 };
return Func(p0, p1, args, 26);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26 };
return Func(p0, p1, args, 27);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27 };
return Func(p0, p1, args, 28);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
return Func(p0, p1, args, 29);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
return Func(p0, p1, args, 30);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
return Func(p0, p1, args, 31);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30, const Arg& a31) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
return Func(p0, p1, args, 32);
}
};
} // namespace re2
#endif // RE2_VARIADIC_FUNCTION_H_

View File

@@ -0,0 +1,244 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Helper class for traversing Regexps without recursion.
// Clients should declare their own subclasses that override
// the PreVisit and PostVisit methods, which are called before
// and after visiting the subexpressions.
// Not quite the Visitor pattern, because (among other things)
// the Visitor pattern is recursive.
#ifndef RE2_WALKER_INL_H__
#define RE2_WALKER_INL_H__
#include "re2/regexp.h"
namespace re2 {
template<typename T> struct WalkState;
template<typename T> class Regexp::Walker {
public:
Walker();
virtual ~Walker();
// Virtual method called before visiting re's children.
// PreVisit passes ownership of its return value to its caller.
// The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
// and passed to the child PreVisits and PostVisits as parent_arg.
// At the top-most Regexp, parent_arg is arg passed to walk.
// If PreVisit sets *stop to true, the walk does not recurse
// into the children. Instead it behaves as though the return
// value from PreVisit is the return value from PostVisit.
// The default PreVisit returns parent_arg.
virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
// Virtual method called after visiting re's children.
// The pre_arg is the T that PreVisit returned.
// The child_args is a vector of the T that the child PostVisits returned.
// PostVisit takes ownership of pre_arg.
// PostVisit takes ownership of the Ts
// in *child_args, but not the vector itself.
// PostVisit passes ownership of its return value
// to its caller.
// The default PostVisit simply returns pre_arg.
virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
T* child_args, int nchild_args);
// Virtual method called to copy a T,
// when Walk notices that more than one child is the same re.
virtual T Copy(T arg);
// Virtual method called to do a "quick visit" of the re,
// but not its children. Only called once the visit budget
// has been used up and we're trying to abort the walk
// as quickly as possible. Should return a value that
// makes sense for the parent PostVisits still to be run.
// This function is (hopefully) only called by
// WalkExponential, but must be implemented by all clients,
// just in case.
virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
// Walks over a regular expression.
// Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
// Returns the T returned by PostVisit on re.
T Walk(Regexp* re, T top_arg);
// Like Walk, but doesn't use Copy. This can lead to
// exponential runtimes on cross-linked Regexps like the
// ones generated by Simplify. To help limit this,
// at most max_visits nodes will be visited and then
// the walk will be cut off early.
// If the walk *is* cut off early, ShortVisit(re)
// will be called on regexps that cannot be fully
// visited rather than calling PreVisit/PostVisit.
T WalkExponential(Regexp* re, T top_arg, int max_visits);
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
void Reset();
// Returns whether walk was cut off.
bool stopped_early() { return stopped_early_; }
private:
// Walk state for the entire traversal.
stack<WalkState<T> >* stack_;
bool stopped_early_;
int max_visits_;
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
DISALLOW_COPY_AND_ASSIGN(Walker);
};
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
T parent_arg,
bool* stop) {
return parent_arg;
}
template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
T parent_arg,
T pre_arg,
T* child_args,
int nchild_args) {
return pre_arg;
}
template<typename T> T Regexp::Walker<T>::Copy(T arg) {
return arg;
}
// State about a single level in the traversal.
template<typename T> struct WalkState {
WalkState<T>(Regexp* re, T parent)
: re(re),
n(-1),
parent_arg(parent),
child_args(NULL) { }
Regexp* re; // The regexp
int n; // The index of the next child to process; -1 means need to PreVisit
T parent_arg; // Accumulated arguments.
T pre_arg;
T child_arg; // One-element buffer for child_args.
T* child_args;
};
template<typename T> Regexp::Walker<T>::Walker() {
stack_ = new stack<WalkState<T> >;
stopped_early_ = false;
}
template<typename T> Regexp::Walker<T>::~Walker() {
Reset();
delete stack_;
}
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
template<typename T> void Regexp::Walker<T>::Reset() {
if (stack_ && stack_->size() > 0) {
LOG(DFATAL) << "Stack not empty.";
while (stack_->size() > 0) {
delete stack_->top().child_args;
stack_->pop();
}
}
}
template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
bool use_copy) {
Reset();
if (re == NULL) {
LOG(DFATAL) << "Walk NULL";
return top_arg;
}
stack_->push(WalkState<T>(re, top_arg));
WalkState<T>* s;
for (;;) {
T t;
s = &stack_->top();
Regexp* re = s->re;
switch (s->n) {
case -1: {
if (--max_visits_ < 0) {
stopped_early_ = true;
t = ShortVisit(re, s->parent_arg);
break;
}
bool stop = false;
s->pre_arg = PreVisit(re, s->parent_arg, &stop);
if (stop) {
t = s->pre_arg;
break;
}
s->n = 0;
s->child_args = NULL;
if (re->nsub_ == 1)
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
// Fall through.
}
default: {
if (re->nsub_ > 0) {
Regexp** sub = re->sub();
if (s->n < re->nsub_) {
if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}
}
t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
if (re->nsub_ > 1)
delete[] s->child_args;
break;
}
}
// We've finished stack_->top().
// Update next guy down.
stack_->pop();
if (stack_->size() == 0)
return t;
s = &stack_->top();
if (s->child_args != NULL)
s->child_args[s->n] = t;
else
s->child_arg = t;
s->n++;
}
}
template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
// Without the exponential walking behavior,
// this budget should be more than enough for any
// regexp, and yet not enough to get us in trouble
// as far as CPU time.
max_visits_ = 1000000;
return WalkInternal(re, top_arg, true);
}
template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
int max_visits) {
max_visits_ = max_visits;
return WalkInternal(re, top_arg, false);
}
} // namespace re2
#endif // RE2_WALKER_INL_H__

View File

@@ -1,58 +0,0 @@
/**********************************************************************
ascii.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regenc.h"
static int
ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype)
{
if (code < 128)
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
else
return FALSE;
}
OnigEncodingType OnigEncodingASCII = {
onigenc_single_byte_mbc_enc_len,
"US-ASCII", /* name */
1, /* max byte length */
1, /* min byte length */
onigenc_is_mbc_newline_0x0a,
onigenc_single_byte_mbc_to_code,
onigenc_single_byte_code_to_mbclen,
onigenc_single_byte_code_to_mbc,
onigenc_ascii_mbc_case_fold,
onigenc_ascii_apply_all_case_fold,
onigenc_ascii_get_case_fold_codes_by_str,
onigenc_minimum_property_name_to_ctype,
ascii_is_code_ctype,
onigenc_not_support_get_ctype_code_range,
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match
};

View File

@@ -1,130 +0,0 @@
/* config.h. Generated from config.h.in by configure. */
/* config.h.in. Generated from configure.in by autoheader. */
#ifdef _WIN32
#include "win32/onig_config.h"
#else
/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
systems. This function is required for `alloca.c' support on those systems.
*/
/* #undef CRAY_STACKSEG_END */
/* Define to 1 if using `alloca.c'. */
/* #undef C_ALLOCA */
/* Define to 1 if you have `alloca', as a function or macro. */
#define HAVE_ALLOCA 1
/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
*/
#define HAVE_ALLOCA_H 1
/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1
/* Define if compilerr supports prototypes */
#define HAVE_PROTOTYPES 1
/* Define if compiler supports stdarg prototypes */
#define HAVE_STDARG_PROTOTYPES 1
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1
/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1
/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1
/* Define to 1 if you have the <sys/times.h> header file. */
#define HAVE_SYS_TIMES_H 1
/* Define to 1 if you have the <sys/time.h> header file. */
#define HAVE_SYS_TIME_H 1
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1
/* Define to the sub-directory in which libtool stores uninstalled libraries.
*/
#define LT_OBJDIR ".libs/"
/* Name of package */
#define PACKAGE "onig"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT ""
/* Define to the full name of this package. */
#define PACKAGE_NAME "onig"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "onig 5.9.6"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "onig"
/* Define to the home page for this package. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "5.9.6"
/* The size of `int', as computed by sizeof. */
#define SIZEOF_INT 4
/* The size of `long', as computed by sizeof. */
#define SIZEOF_LONG 8
/* The size of `short', as computed by sizeof. */
#define SIZEOF_SHORT 2
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be
automatically deduced at runtime.
STACK_DIRECTION > 0 => grows toward higher addresses
STACK_DIRECTION < 0 => grows toward lower addresses
STACK_DIRECTION = 0 => direction of growth unknown */
/* #undef STACK_DIRECTION */
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1
/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
#define TIME_WITH_SYS_TIME 1
/* Define if combination explosion check */
/* #undef USE_COMBINATION_EXPLOSION_CHECK */
/* Define if enable CR+NL as line terminator */
/* #undef USE_CRNL_AS_LINE_TERMINATOR */
/* Version number of package */
#define VERSION "5.9.6"
/* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */
/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */
#endif

View File

@@ -1,85 +0,0 @@
#ifndef ONIGGNU_H
#define ONIGGNU_H
/**********************************************************************
oniggnu.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "oniguruma.h"
#ifdef __cplusplus
extern "C" {
#endif
#define RE_MBCTYPE_ASCII 0
#define RE_MBCTYPE_EUC 1
#define RE_MBCTYPE_SJIS 2
#define RE_MBCTYPE_UTF8 3
/* GNU regex options */
#ifndef RE_NREGS
#define RE_NREGS ONIG_NREGION
#endif
#define RE_OPTION_IGNORECASE ONIG_OPTION_IGNORECASE
#define RE_OPTION_EXTENDED ONIG_OPTION_EXTEND
#define RE_OPTION_MULTILINE ONIG_OPTION_MULTILINE
#define RE_OPTION_SINGLELINE ONIG_OPTION_SINGLELINE
#define RE_OPTION_LONGEST ONIG_OPTION_FIND_LONGEST
#define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE)
#define RE_OPTION_FIND_NOT_EMPTY ONIG_OPTION_FIND_NOT_EMPTY
#define RE_OPTION_NEGATE_SINGLELINE ONIG_OPTION_NEGATE_SINGLELINE
#define RE_OPTION_DONT_CAPTURE_GROUP ONIG_OPTION_DONT_CAPTURE_GROUP
#define RE_OPTION_CAPTURE_GROUP ONIG_OPTION_CAPTURE_GROUP
ONIG_EXTERN
void re_mbcinit P_((int));
ONIG_EXTERN
int re_compile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf));
ONIG_EXTERN
int re_recompile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf));
ONIG_EXTERN
void re_free_pattern P_((struct re_pattern_buffer*));
ONIG_EXTERN
int re_adjust_startpos P_((struct re_pattern_buffer*, const char*, int, int, int));
ONIG_EXTERN
int re_search P_((struct re_pattern_buffer*, const char*, int, int, int, struct re_registers*));
ONIG_EXTERN
int re_match P_((struct re_pattern_buffer*, const char *, int, int, struct re_registers*));
ONIG_EXTERN
void re_set_casetable P_((const char*));
ONIG_EXTERN
void re_free_registers P_((struct re_registers*));
ONIG_EXTERN
int re_alloc_pattern P_((struct re_pattern_buffer**)); /* added */
#ifdef __cplusplus
}
#endif
#endif /* ONIGGNU_H */

View File

@@ -1,169 +0,0 @@
#ifndef ONIGPOSIX_H
#define ONIGPOSIX_H
/**********************************************************************
onigposix.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
/* options */
#define REG_ICASE (1<<0)
#define REG_NEWLINE (1<<1)
#define REG_NOTBOL (1<<2)
#define REG_NOTEOL (1<<3)
#define REG_EXTENDED (1<<4) /* if not setted, Basic Onigular Expression */
#define REG_NOSUB (1<<5)
/* POSIX error codes */
#define REG_NOMATCH 1
#define REG_BADPAT 2
#define REG_ECOLLATE 3
#define REG_ECTYPE 4
#define REG_EESCAPE 5
#define REG_ESUBREG 6
#define REG_EBRACK 7
#define REG_EPAREN 8
#define REG_EBRACE 9
#define REG_BADBR 10
#define REG_ERANGE 11
#define REG_ESPACE 12
#define REG_BADRPT 13
/* extended error codes */
#define REG_EONIG_INTERNAL 14
#define REG_EONIG_BADWC 15
#define REG_EONIG_BADARG 16
#define REG_EONIG_THREAD 17
/* character encodings (for reg_set_encoding()) */
#define REG_POSIX_ENCODING_ASCII 0
#define REG_POSIX_ENCODING_EUC_JP 1
#define REG_POSIX_ENCODING_SJIS 2
#define REG_POSIX_ENCODING_UTF8 3
#define REG_POSIX_ENCODING_UTF16_BE 4
#define REG_POSIX_ENCODING_UTF16_LE 5
typedef int regoff_t;
typedef struct {
regoff_t rm_so;
regoff_t rm_eo;
} regmatch_t;
/* POSIX regex_t */
typedef struct {
void* onig; /* Oniguruma regex_t* */
size_t re_nsub;
int comp_options;
} regex_t;
#ifndef P_
#if defined(__STDC__) || defined(_WIN32)
# define P_(args) args
#else
# define P_(args) ()
#endif
#endif
#ifndef ONIG_EXTERN
#if defined(_WIN32) && !defined(__GNUC__)
#if defined(EXPORT)
#define ONIG_EXTERN extern __declspec(dllexport)
#else
#define ONIG_EXTERN extern __declspec(dllimport)
#endif
#endif
#endif
#ifndef ONIG_EXTERN
#define ONIG_EXTERN extern
#endif
#ifndef ONIGURUMA_H
typedef unsigned int OnigOptionType;
/* syntax */
typedef struct {
unsigned int op;
unsigned int op2;
unsigned int behavior;
OnigOptionType options; /* default option */
} OnigSyntaxType;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixBasic;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixExtended;
ONIG_EXTERN OnigSyntaxType OnigSyntaxEmacs;
ONIG_EXTERN OnigSyntaxType OnigSyntaxGrep;
ONIG_EXTERN OnigSyntaxType OnigSyntaxGnuRegex;
ONIG_EXTERN OnigSyntaxType OnigSyntaxJava;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl;
ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby;
/* predefined syntaxes (see regsyntax.c) */
#define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic)
#define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended)
#define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs)
#define ONIG_SYNTAX_GREP (&OnigSyntaxGrep)
#define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex)
#define ONIG_SYNTAX_JAVA (&OnigSyntaxJava)
#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl)
#define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby)
/* default syntax */
#define ONIG_SYNTAX_DEFAULT OnigDefaultSyntax
ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
ONIG_EXTERN int onig_set_default_syntax P_((OnigSyntaxType* syntax));
ONIG_EXTERN void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from));
ONIG_EXTERN const char* onig_version P_((void));
ONIG_EXTERN const char* onig_copyright P_((void));
#endif /* ONIGURUMA_H */
ONIG_EXTERN int regcomp P_((regex_t* reg, const char* pat, int options));
ONIG_EXTERN int regexec P_((regex_t* reg, const char* str, size_t nmatch, regmatch_t* matches, int options));
ONIG_EXTERN void regfree P_((regex_t* reg));
ONIG_EXTERN size_t regerror P_((int code, const regex_t* reg, char* buf, size_t size));
/* extended API */
ONIG_EXTERN void reg_set_encoding P_((int enc));
ONIG_EXTERN int reg_name_to_group_numbers P_((regex_t* reg, const unsigned char* name, const unsigned char* name_end, int** nums));
ONIG_EXTERN int reg_foreach_name P_((regex_t* reg, int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*), void* arg));
ONIG_EXTERN int reg_number_of_names P_((regex_t* reg));
#ifdef __cplusplus
}
#endif
#endif /* ONIGPOSIX_H */

View File

@@ -1,827 +0,0 @@
#ifndef ONIGURUMA_H
#define ONIGURUMA_H
/**********************************************************************
oniguruma.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2009 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifdef __cplusplus
extern "C" {
#endif
#define ONIGURUMA
#define ONIGURUMA_VERSION_MAJOR 5
#define ONIGURUMA_VERSION_MINOR 9
#define ONIGURUMA_VERSION_TEENY 6
#ifdef __cplusplus
# ifndef HAVE_PROTOTYPES
# define HAVE_PROTOTYPES 1
# endif
# ifndef HAVE_STDARG_PROTOTYPES
# define HAVE_STDARG_PROTOTYPES 1
# endif
#endif
/* escape Mac OS X/Xcode 2.4/gcc 4.0.1 problem */
#if defined(__APPLE__) && defined(__GNUC__) && __GNUC__ >= 4
# ifndef HAVE_STDARG_PROTOTYPES
# define HAVE_STDARG_PROTOTYPES 1
# endif
#endif
#ifdef HAVE_STDARG_H
# ifndef HAVE_STDARG_PROTOTYPES
# define HAVE_STDARG_PROTOTYPES 1
# endif
#endif
#ifndef P_
#if defined(__STDC__) || defined(_WIN32)
# define P_(args) args
#else
# define P_(args) ()
#endif
#endif
#ifndef PV_
#ifdef HAVE_STDARG_PROTOTYPES
# define PV_(args) args
#else
# define PV_(args) ()
#endif
#endif
#ifndef ONIG_EXTERN
#if defined(_WIN32) && !defined(__GNUC__)
#if defined(EXPORT) || defined(RUBY_EXPORT)
#define ONIG_EXTERN extern __declspec(dllexport)
#else
#define ONIG_EXTERN extern
#endif
#endif
#endif
#ifndef ONIG_EXTERN
#define ONIG_EXTERN extern
#endif
/* PART: character encoding */
#ifndef ONIG_ESCAPE_UCHAR_COLLISION
#define UChar OnigUChar
#endif
#ifdef _WIN32
# include <windows.h>
typedef ULONG_PTR OnigCodePoint;
#else
typedef unsigned long OnigCodePoint;
#endif
typedef unsigned char OnigUChar;
typedef unsigned int OnigCtype;
typedef unsigned int OnigDistance;
#define ONIG_INFINITE_DISTANCE ~((OnigDistance )0)
typedef unsigned int OnigCaseFoldType; /* case fold flag */
ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag;
/* #define ONIGENC_CASE_FOLD_HIRAGANA_KATAKANA (1<<1) */
/* #define ONIGENC_CASE_FOLD_KATAKANA_WIDTH (1<<2) */
#define ONIGENC_CASE_FOLD_TURKISH_AZERI (1<<20)
#define INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR (1<<30)
#define ONIGENC_CASE_FOLD_MIN INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
#define ONIGENC_CASE_FOLD_DEFAULT OnigDefaultCaseFoldFlag
#define ONIGENC_MAX_COMP_CASE_FOLD_CODE_LEN 3
#define ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM 13
/* 13 => Unicode:0x1ffc */
/* code range */
#define ONIGENC_CODE_RANGE_NUM(range) ((int )range[0])
#define ONIGENC_CODE_RANGE_FROM(range,i) range[((i)*2) + 1]
#define ONIGENC_CODE_RANGE_TO(range,i) range[((i)*2) + 2]
typedef struct {
int byte_len; /* argument(original) character(s) byte length */
int code_len; /* number of code */
OnigCodePoint code[ONIGENC_MAX_COMP_CASE_FOLD_CODE_LEN];
} OnigCaseFoldCodeItem;
typedef struct {
OnigCodePoint esc;
OnigCodePoint anychar;
OnigCodePoint anytime;
OnigCodePoint zero_or_one_time;
OnigCodePoint one_or_more_time;
OnigCodePoint anychar_anytime;
} OnigMetaCharTableType;
typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg);
typedef struct OnigEncodingTypeST {
int (*mbc_enc_len)(const OnigUChar* p);
const char* name;
int max_enc_len;
int min_enc_len;
int (*is_mbc_newline)(const OnigUChar* p, const OnigUChar* end);
OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end);
int (*code_to_mbclen)(OnigCodePoint code);
int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf);
int (*mbc_case_fold)(OnigCaseFoldType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to);
int (*apply_all_case_fold)(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg);
int (*get_case_fold_codes_by_str)(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem acs[]);
int (*property_name_to_ctype)(struct OnigEncodingTypeST* enc, OnigUChar* p, OnigUChar* end);
int (*is_code_ctype)(OnigCodePoint code, OnigCtype ctype);
int (*get_ctype_code_range)(OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[]);
OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p);
int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end);
} OnigEncodingType;
typedef OnigEncodingType* OnigEncoding;
ONIG_EXTERN OnigEncodingType OnigEncodingASCII;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_1;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_2;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_3;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_4;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_5;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_6;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_7;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_8;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_9;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_10;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_11;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_13;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_14;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_15;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_16;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF8;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_BE;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_LE;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_BE;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_LE;
ONIG_EXTERN OnigEncodingType OnigEncodingEUC_JP;
ONIG_EXTERN OnigEncodingType OnigEncodingEUC_TW;
ONIG_EXTERN OnigEncodingType OnigEncodingEUC_KR;
ONIG_EXTERN OnigEncodingType OnigEncodingEUC_CN;
ONIG_EXTERN OnigEncodingType OnigEncodingSJIS;
ONIG_EXTERN OnigEncodingType OnigEncodingKOI8;
ONIG_EXTERN OnigEncodingType OnigEncodingKOI8_R;
ONIG_EXTERN OnigEncodingType OnigEncodingCP1251;
ONIG_EXTERN OnigEncodingType OnigEncodingBIG5;
ONIG_EXTERN OnigEncodingType OnigEncodingGB18030;
#define ONIG_ENCODING_ASCII (&OnigEncodingASCII)
#define ONIG_ENCODING_ISO_8859_1 (&OnigEncodingISO_8859_1)
#define ONIG_ENCODING_ISO_8859_2 (&OnigEncodingISO_8859_2)
#define ONIG_ENCODING_ISO_8859_3 (&OnigEncodingISO_8859_3)
#define ONIG_ENCODING_ISO_8859_4 (&OnigEncodingISO_8859_4)
#define ONIG_ENCODING_ISO_8859_5 (&OnigEncodingISO_8859_5)
#define ONIG_ENCODING_ISO_8859_6 (&OnigEncodingISO_8859_6)
#define ONIG_ENCODING_ISO_8859_7 (&OnigEncodingISO_8859_7)
#define ONIG_ENCODING_ISO_8859_8 (&OnigEncodingISO_8859_8)
#define ONIG_ENCODING_ISO_8859_9 (&OnigEncodingISO_8859_9)
#define ONIG_ENCODING_ISO_8859_10 (&OnigEncodingISO_8859_10)
#define ONIG_ENCODING_ISO_8859_11 (&OnigEncodingISO_8859_11)
#define ONIG_ENCODING_ISO_8859_13 (&OnigEncodingISO_8859_13)
#define ONIG_ENCODING_ISO_8859_14 (&OnigEncodingISO_8859_14)
#define ONIG_ENCODING_ISO_8859_15 (&OnigEncodingISO_8859_15)
#define ONIG_ENCODING_ISO_8859_16 (&OnigEncodingISO_8859_16)
#define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8)
#define ONIG_ENCODING_UTF16_BE (&OnigEncodingUTF16_BE)
#define ONIG_ENCODING_UTF16_LE (&OnigEncodingUTF16_LE)
#define ONIG_ENCODING_UTF32_BE (&OnigEncodingUTF32_BE)
#define ONIG_ENCODING_UTF32_LE (&OnigEncodingUTF32_LE)
#define ONIG_ENCODING_EUC_JP (&OnigEncodingEUC_JP)
#define ONIG_ENCODING_EUC_TW (&OnigEncodingEUC_TW)
#define ONIG_ENCODING_EUC_KR (&OnigEncodingEUC_KR)
#define ONIG_ENCODING_EUC_CN (&OnigEncodingEUC_CN)
#define ONIG_ENCODING_SJIS (&OnigEncodingSJIS)
#define ONIG_ENCODING_KOI8 (&OnigEncodingKOI8)
#define ONIG_ENCODING_KOI8_R (&OnigEncodingKOI8_R)
#define ONIG_ENCODING_CP1251 (&OnigEncodingCP1251)
#define ONIG_ENCODING_BIG5 (&OnigEncodingBIG5)
#define ONIG_ENCODING_GB18030 (&OnigEncodingGB18030)
#define ONIG_ENCODING_UNDEF ((OnigEncoding )0)
/* work size */
#define ONIGENC_CODE_TO_MBC_MAXLEN 7
#define ONIGENC_MBC_CASE_FOLD_MAXLEN 18
/* 18: 6(max-byte) * 3(case-fold chars) */
/* character types */
#define ONIGENC_CTYPE_NEWLINE 0
#define ONIGENC_CTYPE_ALPHA 1
#define ONIGENC_CTYPE_BLANK 2
#define ONIGENC_CTYPE_CNTRL 3
#define ONIGENC_CTYPE_DIGIT 4
#define ONIGENC_CTYPE_GRAPH 5
#define ONIGENC_CTYPE_LOWER 6
#define ONIGENC_CTYPE_PRINT 7
#define ONIGENC_CTYPE_PUNCT 8
#define ONIGENC_CTYPE_SPACE 9
#define ONIGENC_CTYPE_UPPER 10
#define ONIGENC_CTYPE_XDIGIT 11
#define ONIGENC_CTYPE_WORD 12
#define ONIGENC_CTYPE_ALNUM 13 /* alpha || digit */
#define ONIGENC_CTYPE_ASCII 14
#define ONIGENC_MAX_STD_CTYPE ONIGENC_CTYPE_ASCII
#define onig_enc_len(enc,p,end) ONIGENC_MBC_ENC_LEN(enc,p)
#define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF)
#define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1)
#define ONIGENC_IS_MBC_HEAD(enc,p) (ONIGENC_MBC_ENC_LEN(enc,p) != 1)
#define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128)
#define ONIGENC_IS_CODE_ASCII(code) ((code) < 128)
#define ONIGENC_IS_MBC_WORD(enc,s,end) \
ONIGENC_IS_CODE_WORD(enc,ONIGENC_MBC_TO_CODE(enc,s,end))
#define ONIGENC_NAME(enc) ((enc)->name)
#define ONIGENC_MBC_CASE_FOLD(enc,flag,pp,end,buf) \
(enc)->mbc_case_fold(flag,(const OnigUChar** )pp,end,buf)
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \
(enc)->is_allowed_reverse_match(s,end)
#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \
(enc)->left_adjust_char_head(start, s)
#define ONIGENC_APPLY_ALL_CASE_FOLD(enc,case_fold_flag,f,arg) \
(enc)->apply_all_case_fold(case_fold_flag,f,arg)
#define ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc,case_fold_flag,p,end,acs) \
(enc)->get_case_fold_codes_by_str(case_fold_flag,p,end,acs)
#define ONIGENC_STEP_BACK(enc,start,s,n) \
onigenc_step_back((enc),(start),(s),(n))
#define ONIGENC_MBC_ENC_LEN(enc,p) (enc)->mbc_enc_len(p)
#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len)
#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc)
#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len)
#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end))
#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end))
#define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code)
#define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf)
#define ONIGENC_PROPERTY_NAME_TO_CTYPE(enc,p,end) \
(enc)->property_name_to_ctype(enc,p,end)
#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->is_code_ctype(code,ctype)
#define ONIGENC_IS_CODE_NEWLINE(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_NEWLINE)
#define ONIGENC_IS_CODE_GRAPH(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH)
#define ONIGENC_IS_CODE_PRINT(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PRINT)
#define ONIGENC_IS_CODE_ALNUM(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALNUM)
#define ONIGENC_IS_CODE_ALPHA(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALPHA)
#define ONIGENC_IS_CODE_LOWER(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_LOWER)
#define ONIGENC_IS_CODE_UPPER(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_UPPER)
#define ONIGENC_IS_CODE_CNTRL(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_CNTRL)
#define ONIGENC_IS_CODE_PUNCT(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PUNCT)
#define ONIGENC_IS_CODE_SPACE(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_SPACE)
#define ONIGENC_IS_CODE_BLANK(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_BLANK)
#define ONIGENC_IS_CODE_DIGIT(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_DIGIT)
#define ONIGENC_IS_CODE_XDIGIT(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_XDIGIT)
#define ONIGENC_IS_CODE_WORD(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_WORD)
#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,sbout,ranges) \
(enc)->get_ctype_code_range(ctype,sbout,ranges)
ONIG_EXTERN
OnigUChar* onigenc_step_back P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, int n));
/* encoding API */
ONIG_EXTERN
int onigenc_init P_((void));
ONIG_EXTERN
int onigenc_set_default_encoding P_((OnigEncoding enc));
ONIG_EXTERN
OnigEncoding onigenc_get_default_encoding P_((void));
ONIG_EXTERN
void onigenc_set_default_caseconv_table P_((const OnigUChar* table));
ONIG_EXTERN
OnigUChar* onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar** prev));
ONIG_EXTERN
OnigUChar* onigenc_get_prev_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s));
ONIG_EXTERN
OnigUChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s));
ONIG_EXTERN
OnigUChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s));
ONIG_EXTERN
int onigenc_strlen P_((OnigEncoding enc, const OnigUChar* p, const OnigUChar* end));
ONIG_EXTERN
int onigenc_strlen_null P_((OnigEncoding enc, const OnigUChar* p));
ONIG_EXTERN
int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p));
/* PART: regular expression */
/* config parameters */
#define ONIG_NREGION 10
#define ONIG_MAX_BACKREF_NUM 1000
#define ONIG_MAX_REPEAT_NUM 100000
#define ONIG_MAX_MULTI_BYTE_RANGES_NUM 10000
/* constants */
#define ONIG_MAX_ERROR_MESSAGE_LEN 90
typedef unsigned int OnigOptionType;
#define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE
/* options */
#define ONIG_OPTION_NONE 0U
#define ONIG_OPTION_IGNORECASE 1U
#define ONIG_OPTION_EXTEND (ONIG_OPTION_IGNORECASE << 1)
#define ONIG_OPTION_MULTILINE (ONIG_OPTION_EXTEND << 1)
#define ONIG_OPTION_SINGLELINE (ONIG_OPTION_MULTILINE << 1)
#define ONIG_OPTION_FIND_LONGEST (ONIG_OPTION_SINGLELINE << 1)
#define ONIG_OPTION_FIND_NOT_EMPTY (ONIG_OPTION_FIND_LONGEST << 1)
#define ONIG_OPTION_NEGATE_SINGLELINE (ONIG_OPTION_FIND_NOT_EMPTY << 1)
#define ONIG_OPTION_DONT_CAPTURE_GROUP (ONIG_OPTION_NEGATE_SINGLELINE << 1)
#define ONIG_OPTION_CAPTURE_GROUP (ONIG_OPTION_DONT_CAPTURE_GROUP << 1)
/* options (search time) */
#define ONIG_OPTION_NOTBOL (ONIG_OPTION_CAPTURE_GROUP << 1)
#define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1)
#define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1)
#define ONIG_OPTION_MAXBIT ONIG_OPTION_POSIX_REGION /* limit */
#define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt))
#define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt))
#define ONIG_IS_OPTION_ON(options,option) ((options) & (option))
/* syntax */
typedef struct {
unsigned int op;
unsigned int op2;
unsigned int behavior;
OnigOptionType options; /* default option */
OnigMetaCharTableType meta_char_table;
} OnigSyntaxType;
ONIG_EXTERN OnigSyntaxType OnigSyntaxASIS;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixBasic;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixExtended;
ONIG_EXTERN OnigSyntaxType OnigSyntaxEmacs;
ONIG_EXTERN OnigSyntaxType OnigSyntaxGrep;
ONIG_EXTERN OnigSyntaxType OnigSyntaxGnuRegex;
ONIG_EXTERN OnigSyntaxType OnigSyntaxJava;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG;
ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby;
/* predefined syntaxes (see regsyntax.c) */
#define ONIG_SYNTAX_ASIS (&OnigSyntaxASIS)
#define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic)
#define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended)
#define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs)
#define ONIG_SYNTAX_GREP (&OnigSyntaxGrep)
#define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex)
#define ONIG_SYNTAX_JAVA (&OnigSyntaxJava)
#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl)
#define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG)
#define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby)
/* default syntax */
ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYNTAX_DEFAULT OnigDefaultSyntax
/* syntax (operators) */
#define ONIG_SYN_OP_VARIABLE_META_CHARACTERS (1U<<0)
#define ONIG_SYN_OP_DOT_ANYCHAR (1U<<1) /* . */
#define ONIG_SYN_OP_ASTERISK_ZERO_INF (1U<<2) /* * */
#define ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (1U<<3)
#define ONIG_SYN_OP_PLUS_ONE_INF (1U<<4) /* + */
#define ONIG_SYN_OP_ESC_PLUS_ONE_INF (1U<<5)
#define ONIG_SYN_OP_QMARK_ZERO_ONE (1U<<6) /* ? */
#define ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (1U<<7)
#define ONIG_SYN_OP_BRACE_INTERVAL (1U<<8) /* {lower,upper} */
#define ONIG_SYN_OP_ESC_BRACE_INTERVAL (1U<<9) /* \{lower,upper\} */
#define ONIG_SYN_OP_VBAR_ALT (1U<<10) /* | */
#define ONIG_SYN_OP_ESC_VBAR_ALT (1U<<11) /* \| */
#define ONIG_SYN_OP_LPAREN_SUBEXP (1U<<12) /* (...) */
#define ONIG_SYN_OP_ESC_LPAREN_SUBEXP (1U<<13) /* \(...\) */
#define ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (1U<<14) /* \A, \Z, \z */
#define ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (1U<<15) /* \G */
#define ONIG_SYN_OP_DECIMAL_BACKREF (1U<<16) /* \num */
#define ONIG_SYN_OP_BRACKET_CC (1U<<17) /* [...] */
#define ONIG_SYN_OP_ESC_W_WORD (1U<<18) /* \w, \W */
#define ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (1U<<19) /* \<. \> */
#define ONIG_SYN_OP_ESC_B_WORD_BOUND (1U<<20) /* \b, \B */
#define ONIG_SYN_OP_ESC_S_WHITE_SPACE (1U<<21) /* \s, \S */
#define ONIG_SYN_OP_ESC_D_DIGIT (1U<<22) /* \d, \D */
#define ONIG_SYN_OP_LINE_ANCHOR (1U<<23) /* ^, $ */
#define ONIG_SYN_OP_POSIX_BRACKET (1U<<24) /* [:xxxx:] */
#define ONIG_SYN_OP_QMARK_NON_GREEDY (1U<<25) /* ??,*?,+?,{n,m}? */
#define ONIG_SYN_OP_ESC_CONTROL_CHARS (1U<<26) /* \n,\r,\t,\a ... */
#define ONIG_SYN_OP_ESC_C_CONTROL (1U<<27) /* \cx */
#define ONIG_SYN_OP_ESC_OCTAL3 (1U<<28) /* \OOO */
#define ONIG_SYN_OP_ESC_X_HEX2 (1U<<29) /* \xHH */
#define ONIG_SYN_OP_ESC_X_BRACE_HEX8 (1U<<30) /* \x{7HHHHHHH} */
#define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1U<<0) /* \Q...\E */
#define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1U<<1) /* (?...) */
#define ONIG_SYN_OP2_OPTION_PERL (1U<<2) /* (?imsx),(?-imsx) */
#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imx), (?-imx) */
#define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1U<<4) /* ?+,*+,++ */
#define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1U<<5) /* {n,m}+ */
#define ONIG_SYN_OP2_CCLASS_SET_OP (1U<<6) /* [...&&..[..]..] */
#define ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (1U<<7) /* (?<name>...) */
#define ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (1U<<8) /* \k<name> */
#define ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (1U<<9) /* \g<name>, \g<n> */
#define ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (1U<<10) /* (?@..),(?@<x>..) */
#define ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1U<<11) /* \C-x */
#define ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (1U<<12) /* \M-x */
#define ONIG_SYN_OP2_ESC_V_VTAB (1U<<13) /* \v as VTAB */
#define ONIG_SYN_OP2_ESC_U_HEX4 (1U<<14) /* \uHHHH */
#define ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (1U<<15) /* \`, \' */
#define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1U<<16) /* \p{...}, \P{...} */
#define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1U<<17) /* \p{^..}, \P{^..} */
/* #define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1U<<18) */
#define ONIG_SYN_OP2_ESC_H_XDIGIT (1U<<19) /* \h, \H */
#define ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (1U<<20) /* \ */
/* syntax (behavior) */
#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */
#define ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (1U<<0) /* ?, *, +, {n,m} */
#define ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (1U<<1) /* error or ignore */
#define ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1U<<2) /* ...)... */
#define ONIG_SYN_ALLOW_INVALID_INTERVAL (1U<<3) /* {??? */
#define ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (1U<<4) /* {,n} => {0,n} */
#define ONIG_SYN_STRICT_CHECK_BACKREF (1U<<5) /* /(\1)/,/\1()/ ..*/
#define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1U<<6) /* (?<=a|bc) */
#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1U<<7) /* see doc/RE */
#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1U<<8) /* (?<x>)(?<x>) */
#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */
/* syntax (behavior) in char class [...] */
#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */
#define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */
#define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22)
#define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */
/* syntax (behavior) warning */
#define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */
#define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */
/* meta character specifiers (onig_set_meta_char()) */
#define ONIG_META_CHAR_ESCAPE 0
#define ONIG_META_CHAR_ANYCHAR 1
#define ONIG_META_CHAR_ANYTIME 2
#define ONIG_META_CHAR_ZERO_OR_ONE_TIME 3
#define ONIG_META_CHAR_ONE_OR_MORE_TIME 4
#define ONIG_META_CHAR_ANYCHAR_ANYTIME 5
#define ONIG_INEFFECTIVE_META_CHAR 0
/* error codes */
#define ONIG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -1000)
/* normal return */
#define ONIG_NORMAL 0
#define ONIG_MISMATCH -1
#define ONIG_NO_SUPPORT_CONFIG -2
/* internal error */
#define ONIGERR_MEMORY -5
#define ONIGERR_TYPE_BUG -6
#define ONIGERR_PARSER_BUG -11
#define ONIGERR_STACK_BUG -12
#define ONIGERR_UNDEFINED_BYTECODE -13
#define ONIGERR_UNEXPECTED_BYTECODE -14
#define ONIGERR_MATCH_STACK_LIMIT_OVER -15
#define ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED -21
#define ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22
/* general error */
#define ONIGERR_INVALID_ARGUMENT -30
/* syntax error */
#define ONIGERR_END_PATTERN_AT_LEFT_BRACE -100
#define ONIGERR_END_PATTERN_AT_LEFT_BRACKET -101
#define ONIGERR_EMPTY_CHAR_CLASS -102
#define ONIGERR_PREMATURE_END_OF_CHAR_CLASS -103
#define ONIGERR_END_PATTERN_AT_ESCAPE -104
#define ONIGERR_END_PATTERN_AT_META -105
#define ONIGERR_END_PATTERN_AT_CONTROL -106
#define ONIGERR_META_CODE_SYNTAX -108
#define ONIGERR_CONTROL_CODE_SYNTAX -109
#define ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE -110
#define ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE -111
#define ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS -112
#define ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED -113
#define ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID -114
#define ONIGERR_NESTED_REPEAT_OPERATOR -115
#define ONIGERR_UNMATCHED_CLOSE_PARENTHESIS -116
#define ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS -117
#define ONIGERR_END_PATTERN_IN_GROUP -118
#define ONIGERR_UNDEFINED_GROUP_OPTION -119
#define ONIGERR_INVALID_POSIX_BRACKET_TYPE -121
#define ONIGERR_INVALID_LOOK_BEHIND_PATTERN -122
#define ONIGERR_INVALID_REPEAT_RANGE_PATTERN -123
/* values error (syntax error) */
#define ONIGERR_TOO_BIG_NUMBER -200
#define ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE -201
#define ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE -202
#define ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS -203
#define ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE -204
#define ONIGERR_TOO_MANY_MULTI_BYTE_RANGES -205
#define ONIGERR_TOO_SHORT_MULTI_BYTE_STRING -206
#define ONIGERR_TOO_BIG_BACKREF_NUMBER -207
#define ONIGERR_INVALID_BACKREF -208
#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209
#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212
#define ONIGERR_EMPTY_GROUP_NAME -214
#define ONIGERR_INVALID_GROUP_NAME -215
#define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216
#define ONIGERR_UNDEFINED_NAME_REFERENCE -217
#define ONIGERR_UNDEFINED_GROUP_REFERENCE -218
#define ONIGERR_MULTIPLEX_DEFINED_NAME -219
#define ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL -220
#define ONIGERR_NEVER_ENDING_RECURSION -221
#define ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY -222
#define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223
#define ONIGERR_INVALID_CODE_POINT_VALUE -400
#define ONIGERR_INVALID_WIDE_CHAR_VALUE -400
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401
#define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402
#define ONIGERR_INVALID_COMBINATION_OF_OPTIONS -403
/* errors related to thread */
#define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001
/* must be smaller than BIT_STATUS_BITS_NUM (unsigned int * 8) */
#define ONIG_MAX_CAPTURE_HISTORY_GROUP 31
#define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \
((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i])
typedef struct OnigCaptureTreeNodeStruct {
int group; /* group number */
int beg;
int end;
int allocated;
int num_childs;
struct OnigCaptureTreeNodeStruct** childs;
} OnigCaptureTreeNode;
/* match result region type */
struct re_registers {
int allocated;
int num_regs;
int* beg;
int* end;
/* extended */
OnigCaptureTreeNode* history_root; /* capture history tree root */
};
/* capture tree traverse */
#define ONIG_TRAVERSE_CALLBACK_AT_FIRST 1
#define ONIG_TRAVERSE_CALLBACK_AT_LAST 2
#define ONIG_TRAVERSE_CALLBACK_AT_BOTH \
( ONIG_TRAVERSE_CALLBACK_AT_FIRST | ONIG_TRAVERSE_CALLBACK_AT_LAST )
#define ONIG_REGION_NOTPOS -1
typedef struct re_registers OnigRegion;
typedef struct {
OnigEncoding enc;
OnigUChar* par;
OnigUChar* par_end;
} OnigErrorInfo;
typedef struct {
int lower;
int upper;
} OnigRepeatRange;
typedef void (*OnigWarnFunc) P_((const char* s));
extern void onig_null_warn P_((const char* s));
#define ONIG_NULL_WARN onig_null_warn
#define ONIG_CHAR_TABLE_SIZE 256
/* regex_t state */
#define ONIG_STATE_NORMAL 0
#define ONIG_STATE_SEARCHING 1
#define ONIG_STATE_COMPILING -1
#define ONIG_STATE_MODIFY -2
#define ONIG_STATE(reg) \
((reg)->state > 0 ? ONIG_STATE_SEARCHING : (reg)->state)
typedef struct re_pattern_buffer {
/* common members of BBuf(bytes-buffer) */
unsigned char* p; /* compiled pattern */
unsigned int used; /* used space for p */
unsigned int alloc; /* allocated space for p */
int state; /* normal, searching, compiling */
int num_mem; /* used memory(...) num counted from 1 */
int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */
int num_null_check; /* OP_NULL_CHECK_START/END id counter */
int num_comb_exp_check; /* combination explosion check */
int num_call; /* number of subexp call */
unsigned int capture_history; /* (?@...) flag (1-31) */
unsigned int bt_mem_start; /* need backtrack flag */
unsigned int bt_mem_end; /* need backtrack flag */
int stack_pop_level;
int repeat_range_alloc;
OnigRepeatRange* repeat_range;
OnigEncoding enc;
OnigOptionType options;
OnigSyntaxType* syntax;
OnigCaseFoldType case_fold_flag;
void* name_table;
/* optimization info (string search, char-map and anchors) */
int optimize; /* optimize flag */
int threshold_len; /* search str-length for apply optimize */
int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */
OnigDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */
OnigDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */
int sub_anchor; /* start-anchor for exact or map */
unsigned char *exact;
unsigned char *exact_end;
unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */
int *int_map; /* BM skip for exact_len > 255 */
int *int_map_backward; /* BM skip for backward search */
OnigDistance dmin; /* min-distance of exact or map */
OnigDistance dmax; /* max-distance of exact or map */
/* regex_t link chain */
struct re_pattern_buffer* chain; /* escape compile-conflict */
} OnigRegexType;
typedef OnigRegexType* OnigRegex;
#ifndef ONIG_ESCAPE_REGEX_T_COLLISION
typedef OnigRegexType regex_t;
#endif
typedef struct {
int num_of_elements;
OnigEncoding pattern_enc;
OnigEncoding target_enc;
OnigSyntaxType* syntax;
OnigOptionType option;
OnigCaseFoldType case_fold_flag;
} OnigCompileInfo;
/* Oniguruma Native API */
ONIG_EXTERN
int onig_init P_((void));
ONIG_EXTERN
int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...));
ONIG_EXTERN
void onig_set_warn_func P_((OnigWarnFunc f));
ONIG_EXTERN
void onig_set_verb_warn_func P_((OnigWarnFunc f));
ONIG_EXTERN
int onig_new P_((OnigRegex*, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo));
ONIG_EXTERN
int onig_reg_init P_((regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, OnigSyntaxType* syntax));
int onig_new_without_alloc P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo));
ONIG_EXTERN
int onig_new_deluxe P_((OnigRegex* reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo));
ONIG_EXTERN
void onig_free P_((OnigRegex));
ONIG_EXTERN
void onig_free_body P_((OnigRegex));
ONIG_EXTERN
int onig_recompile P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo));
ONIG_EXTERN
int onig_recompile_deluxe P_((OnigRegex reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo));
ONIG_EXTERN
int onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option));
ONIG_EXTERN
int onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option));
ONIG_EXTERN
OnigRegion* onig_region_new P_((void));
ONIG_EXTERN
void onig_region_init P_((OnigRegion* region));
ONIG_EXTERN
void onig_region_free P_((OnigRegion* region, int free_self));
ONIG_EXTERN
void onig_region_copy P_((OnigRegion* to, OnigRegion* from));
ONIG_EXTERN
void onig_region_clear P_((OnigRegion* region));
ONIG_EXTERN
int onig_region_resize P_((OnigRegion* region, int n));
ONIG_EXTERN
int onig_region_set P_((OnigRegion* region, int at, int beg, int end));
ONIG_EXTERN
int onig_name_to_group_numbers P_((OnigRegex reg, const OnigUChar* name, const OnigUChar* name_end, int** nums));
ONIG_EXTERN
int onig_name_to_backref_number P_((OnigRegex reg, const OnigUChar* name, const OnigUChar* name_end, OnigRegion *region));
ONIG_EXTERN
int onig_foreach_name P_((OnigRegex reg, int (*func)(const OnigUChar*, const OnigUChar*,int,int*,OnigRegex,void*), void* arg));
ONIG_EXTERN
int onig_number_of_names P_((OnigRegex reg));
ONIG_EXTERN
int onig_number_of_captures P_((OnigRegex reg));
ONIG_EXTERN
int onig_number_of_capture_histories P_((OnigRegex reg));
ONIG_EXTERN
OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region));
ONIG_EXTERN
int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,int,int,int,int,void*), void* arg));
ONIG_EXTERN
int onig_noname_group_capture_is_active P_((OnigRegex reg));
ONIG_EXTERN
OnigEncoding onig_get_encoding P_((OnigRegex reg));
ONIG_EXTERN
OnigOptionType onig_get_options P_((OnigRegex reg));
ONIG_EXTERN
OnigCaseFoldType onig_get_case_fold_flag P_((OnigRegex reg));
ONIG_EXTERN
OnigSyntaxType* onig_get_syntax P_((OnigRegex reg));
ONIG_EXTERN
int onig_set_default_syntax P_((OnigSyntaxType* syntax));
ONIG_EXTERN
void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from));
ONIG_EXTERN
unsigned int onig_get_syntax_op P_((OnigSyntaxType* syntax));
ONIG_EXTERN
unsigned int onig_get_syntax_op2 P_((OnigSyntaxType* syntax));
ONIG_EXTERN
unsigned int onig_get_syntax_behavior P_((OnigSyntaxType* syntax));
ONIG_EXTERN
OnigOptionType onig_get_syntax_options P_((OnigSyntaxType* syntax));
ONIG_EXTERN
void onig_set_syntax_op P_((OnigSyntaxType* syntax, unsigned int op));
ONIG_EXTERN
void onig_set_syntax_op2 P_((OnigSyntaxType* syntax, unsigned int op2));
ONIG_EXTERN
void onig_set_syntax_behavior P_((OnigSyntaxType* syntax, unsigned int behavior));
ONIG_EXTERN
void onig_set_syntax_options P_((OnigSyntaxType* syntax, OnigOptionType options));
ONIG_EXTERN
int onig_set_meta_char P_((OnigSyntaxType* syntax, unsigned int what, OnigCodePoint code));
ONIG_EXTERN
void onig_copy_encoding P_((OnigEncoding to, OnigEncoding from));
ONIG_EXTERN
OnigCaseFoldType onig_get_default_case_fold_flag P_((void));
ONIG_EXTERN
int onig_set_default_case_fold_flag P_((OnigCaseFoldType case_fold_flag));
ONIG_EXTERN
unsigned int onig_get_match_stack_limit_size P_((void));
ONIG_EXTERN
int onig_set_match_stack_limit_size P_((unsigned int size));
ONIG_EXTERN
int onig_end P_((void));
ONIG_EXTERN
const char* onig_version P_((void));
ONIG_EXTERN
const char* onig_copyright P_((void));
#ifdef __cplusplus
}
#endif
#endif /* ONIGURUMA_H */

File diff suppressed because it is too large Load Diff

View File

@@ -1,905 +0,0 @@
/**********************************************************************
regenc.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regint.h"
OnigEncoding OnigEncDefaultCharEncoding = NULL;
extern int
onigenc_init(void)
{
return 0;
}
extern OnigEncoding
onigenc_get_default_encoding(void)
{
if (OnigEncDefaultCharEncoding == NULL)
OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
return OnigEncDefaultCharEncoding;
}
extern int
onigenc_set_default_encoding(OnigEncoding enc)
{
OnigEncDefaultCharEncoding = enc;
return 0;
}
extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
{
UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
if (p < s) {
p += enclen(enc, p);
}
return p;
}
extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
const UChar* start, const UChar* s, const UChar** prev)
{
UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
if (p < s) {
if (prev) *prev = (const UChar* )p;
p += enclen(enc, p);
}
else {
if (prev) *prev = (const UChar* )NULL; /* Sorry */
}
return p;
}
extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
{
if (s <= start)
return (UChar* )NULL;
return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
}
extern UChar*
onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
{
while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
if (s <= start)
return (UChar* )NULL;
s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
}
return (UChar* )s;
}
extern UChar*
onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
{
UChar* q = (UChar* )p;
while (n-- > 0) {
q += ONIGENC_MBC_ENC_LEN(enc, q);
}
return (q <= end ? q : NULL);
}
extern int
onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
{
int n = 0;
UChar* q = (UChar* )p;
while (q < end) {
q += ONIGENC_MBC_ENC_LEN(enc, q);
n++;
}
return n;
}
extern int
onigenc_strlen_null(OnigEncoding enc, const UChar* s)
{
int n = 0;
UChar* p = (UChar* )s;
while (1) {
if (*p == '\0') {
UChar* q;
int len = ONIGENC_MBC_MINLEN(enc);
if (len == 1) return n;
q = p + 1;
while (len > 1) {
if (*q != '\0') break;
q++;
len--;
}
if (len == 1) return n;
}
p += ONIGENC_MBC_ENC_LEN(enc, p);
n++;
}
}
extern int
onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
{
UChar* start = (UChar* )s;
UChar* p = (UChar* )s;
while (1) {
if (*p == '\0') {
UChar* q;
int len = ONIGENC_MBC_MINLEN(enc);
if (len == 1) return (int )(p - start);
q = p + 1;
while (len > 1) {
if (*q != '\0') break;
q++;
len--;
}
if (len == 1) return (int )(p - start);
}
p += ONIGENC_MBC_ENC_LEN(enc, p);
}
}
const UChar OnigEncAsciiToLowerCaseTable[] = {
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
'\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
'\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
'\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
'\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#ifdef USE_UPPER_CASE_TABLE
const UChar OnigEncAsciiToUpperCaseTable[256] = {
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
'\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
'\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
'\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
'\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
const unsigned short OnigEncAsciiCtypeTable[256] = {
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
'\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
'\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
'\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
'\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
#ifdef USE_UPPER_CASE_TABLE
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
'\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
'\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
'\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
'\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
extern void
onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
{
/* nothing */
/* obsoleted. */
}
extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
{
return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
}
const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
{ 0x41, 0x61 },
{ 0x42, 0x62 },
{ 0x43, 0x63 },
{ 0x44, 0x64 },
{ 0x45, 0x65 },
{ 0x46, 0x66 },
{ 0x47, 0x67 },
{ 0x48, 0x68 },
{ 0x49, 0x69 },
{ 0x4a, 0x6a },
{ 0x4b, 0x6b },
{ 0x4c, 0x6c },
{ 0x4d, 0x6d },
{ 0x4e, 0x6e },
{ 0x4f, 0x6f },
{ 0x50, 0x70 },
{ 0x51, 0x71 },
{ 0x52, 0x72 },
{ 0x53, 0x73 },
{ 0x54, 0x74 },
{ 0x55, 0x75 },
{ 0x56, 0x76 },
{ 0x57, 0x77 },
{ 0x58, 0x78 },
{ 0x59, 0x79 },
{ 0x5a, 0x7a }
};
extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
OnigApplyAllCaseFoldFunc f, void* arg)
{
OnigCodePoint code;
int i, r;
for (i = 0;
i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
i++) {
code = OnigAsciiLowerMap[i].to;
r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
if (r != 0) return r;
code = OnigAsciiLowerMap[i].from;
r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
if (r != 0) return r;
}
return 0;
}
extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
OnigCaseFoldCodeItem items[])
{
if (0x41 <= *p && *p <= 0x5a) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p + 0x20);
return 1;
}
else if (0x61 <= *p && *p <= 0x7a) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
else
return 0;
}
static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
OnigApplyAllCaseFoldFunc f, void* arg)
{
static OnigCodePoint ss[] = { 0x73, 0x73 };
return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
}
extern int
onigenc_apply_all_case_fold_with_map(int map_size,
const OnigPairCaseFoldCodes map[],
int ess_tsett_flag, OnigCaseFoldType flag,
OnigApplyAllCaseFoldFunc f, void* arg)
{
OnigCodePoint code;
int i, r;
r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
if (r != 0) return r;
for (i = 0; i < map_size; i++) {
code = map[i].to;
r = (*f)(map[i].from, &code, 1, arg);
if (r != 0) return r;
code = map[i].from;
r = (*f)(map[i].to, &code, 1, arg);
if (r != 0) return r;
}
if (ess_tsett_flag != 0)
return ss_apply_all_case_fold(flag, f, arg);
return 0;
}
extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,
const OnigPairCaseFoldCodes map[],
int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
{
if (0x41 <= *p && *p <= 0x5a) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p + 0x20);
if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
&& (*(p+1) == 0x53 || *(p+1) == 0x73)) {
/* SS */
items[1].byte_len = 2;
items[1].code_len = 1;
items[1].code[0] = (OnigCodePoint )0xdf;
return 2;
}
else
return 1;
}
else if (0x61 <= *p && *p <= 0x7a) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
&& (*(p+1) == 0x73 || *(p+1) == 0x53)) {
/* ss */
items[1].byte_len = 2;
items[1].code_len = 1;
items[1].code[0] = (OnigCodePoint )0xdf;
return 2;
}
else
return 1;
}
else if (*p == 0xdf && ess_tsett_flag != 0) {
items[0].byte_len = 1;
items[0].code_len = 2;
items[0].code[0] = (OnigCodePoint )'s';
items[0].code[1] = (OnigCodePoint )'s';
items[1].byte_len = 1;
items[1].code_len = 2;
items[1].code[0] = (OnigCodePoint )'S';
items[1].code[1] = (OnigCodePoint )'S';
items[2].byte_len = 1;
items[2].code_len = 2;
items[2].code[0] = (OnigCodePoint )'s';
items[2].code[1] = (OnigCodePoint )'S';
items[3].byte_len = 1;
items[3].code_len = 2;
items[3].code[0] = (OnigCodePoint )'S';
items[3].code[1] = (OnigCodePoint )'s';
return 4;
}
else {
int i;
for (i = 0; i < map_size; i++) {
if (*p == map[i].from) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = map[i].to;
return 1;
}
else if (*p == map[i].to) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = map[i].from;
return 1;
}
}
}
return 0;
}
extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
OnigCodePoint* sb_out ARG_UNUSED,
const OnigCodePoint* ranges[] ARG_UNUSED)
{
return ONIG_NO_SUPPORT_CONFIG;
}
extern int
onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
{
if (p < end) {
if (*p == 0x0a) return 1;
}
return 0;
}
/* for single byte encodings */
extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
const UChar*end ARG_UNUSED, UChar* lower)
{
*lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
(*p)++;
return 1; /* return byte length of converted char to lower */
}
#if 0
extern int
onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag,
const UChar** pp, const UChar* end)
{
const UChar* p = *pp;
(*pp)++;
return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
#endif
extern int
onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
{
return 1;
}
extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
{
return (OnigCodePoint )(*p);
}
extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
{
return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
}
extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
{
*buf = (UChar )(code & 0xff);
return 1;
}
extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
const UChar* s)
{
return (UChar* )s;
}
extern int
onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
const UChar* end ARG_UNUSED)
{
return TRUE;
}
extern int
onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
const UChar* end ARG_UNUSED)
{
return FALSE;
}
extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
{
int c, i, len;
OnigCodePoint n;
len = enclen(enc, p);
n = (OnigCodePoint )(*p++);
if (len == 1) return n;
for (i = 1; i < len; i++) {
if (p >= end) break;
c = *p++;
n <<= 8; n += c;
}
return n;
}
extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
const UChar** pp, const UChar* end ARG_UNUSED,
UChar* lower)
{
int len;
const UChar *p = *pp;
if (ONIGENC_IS_MBC_ASCII(p)) {
*lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
(*pp)++;
return 1;
}
else {
int i;
len = enclen(enc, p);
for (i = 0; i < len; i++) {
*lower++ = *p++;
}
(*pp) += len;
return len; /* return byte length of converted to lower char */
}
}
#if 0
extern int
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
const UChar** pp, const UChar* end)
{
const UChar* p = *pp;
if (ONIGENC_IS_MBC_ASCII(p)) {
(*pp)++;
return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
(*pp) += enclen(enc, p);
return FALSE;
}
#endif
extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code)
{
if ((code & 0xff00) != 0) return 2;
else return 1;
}
extern int
onigenc_mb4_code_to_mbclen(OnigCodePoint code)
{
if ((code & 0xff000000) != 0) return 4;
else if ((code & 0xff0000) != 0) return 3;
else if ((code & 0xff00) != 0) return 2;
else return 1;
}
extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
{
UChar *p = buf;
if ((code & 0xff00) != 0) {
*p++ = (UChar )((code >> 8) & 0xff);
}
*p++ = (UChar )(code & 0xff);
#if 1
if (enclen(enc, buf) != (p - buf))
return ONIGERR_INVALID_CODE_POINT_VALUE;
#endif
return p - buf;
}
extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
{
UChar *p = buf;
if ((code & 0xff000000) != 0) {
*p++ = (UChar )((code >> 24) & 0xff);
}
if ((code & 0xff0000) != 0 || p != buf) {
*p++ = (UChar )((code >> 16) & 0xff);
}
if ((code & 0xff00) != 0 || p != buf) {
*p++ = (UChar )((code >> 8) & 0xff);
}
*p++ = (UChar )(code & 0xff);
#if 1
if (enclen(enc, buf) != (p - buf))
return ONIGERR_INVALID_CODE_POINT_VALUE;
#endif
return p - buf;
}
extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
{
static PosixBracketEntryType PBS[] = {
{ (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
{ (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
{ (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
{ (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
{ (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
{ (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
{ (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
{ (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
{ (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
{ (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
{ (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
{ (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
{ (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
{ (UChar* )"Word", ONIGENC_CTYPE_WORD, 4 },
{ (UChar* )NULL, -1, 0 }
};
PosixBracketEntryType *pb;
int len;
len = onigenc_strlen(enc, p, end);
for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
if (len == pb->len &&
onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
return pb->ctype;
}
return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
}
extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
unsigned int ctype)
{
if (code < 128)
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
else {
if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
}
}
return FALSE;
}
extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
unsigned int ctype)
{
if (code < 128)
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
else {
if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
}
}
return FALSE;
}
extern int
onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
const UChar* sascii /* ascii */, int n)
{
int x, c;
while (n-- > 0) {
if (p >= end) return (int )(*sascii);
c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
x = *sascii - c;
if (x) return x;
sascii++;
p += enclen(enc, p);
}
return 0;
}
/* Property management */
static int
resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
{
int size;
const OnigCodePoint **list = *plist;
size = sizeof(OnigCodePoint*) * new_size;
if (IS_NULL(list)) {
list = (const OnigCodePoint** )xmalloc(size);
}
else {
list = (const OnigCodePoint** )xrealloc((void* )list, size);
}
if (IS_NULL(list)) return ONIGERR_MEMORY;
*plist = list;
*psize = new_size;
return 0;
}
extern int
onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop,
hash_table_type **table, const OnigCodePoint*** plist, int *pnum,
int *psize)
{
#define PROP_INIT_SIZE 16
int r;
if (*psize <= *pnum) {
int new_size = (*psize == 0 ? PROP_INIT_SIZE : *psize * 2);
r = resize_property_list(new_size, plist, psize);
if (r != 0) return r;
}
(*plist)[*pnum] = prop;
if (ONIG_IS_NULL(*table)) {
*table = onig_st_init_strend_table_with_size(PROP_INIT_SIZE);
if (ONIG_IS_NULL(*table)) return ONIGERR_MEMORY;
}
*pnum = *pnum + 1;
onig_st_insert_strend(*table, name, name + strlen((char* )name),
(hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE));
return 0;
}
extern int
onigenc_property_list_init(int (*f)(void))
{
int r;
THREAD_ATOMIC_START;
r = f();
THREAD_ATOMIC_END;
return r;
}

View File

@@ -1,189 +0,0 @@
#ifndef REGENC_H
#define REGENC_H
/**********************************************************************
regenc.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef PACKAGE
/* PACKAGE is defined in config.h */
#include "onig_config.h"
#endif
#ifdef ONIG_ESCAPE_UCHAR_COLLISION
#undef ONIG_ESCAPE_UCHAR_COLLISION
#endif
#include "oniguruma.h"
typedef struct {
OnigCodePoint from;
OnigCodePoint to;
} OnigPairCaseFoldCodes;
#ifndef NULL
#define NULL ((void* )0)
#endif
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef ARG_UNUSED
#if defined(__GNUC__)
# define ARG_UNUSED __attribute__ ((unused))
#else
# define ARG_UNUSED
#endif
#endif
#define ONIG_IS_NULL(p) (((void*)(p)) == (void*)0)
#define ONIG_IS_NOT_NULL(p) (((void*)(p)) != (void*)0)
#define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL
#define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val)
#define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p)
/* character types bit flag */
#define BIT_CTYPE_NEWLINE (1<< ONIGENC_CTYPE_NEWLINE)
#define BIT_CTYPE_ALPHA (1<< ONIGENC_CTYPE_ALPHA)
#define BIT_CTYPE_BLANK (1<< ONIGENC_CTYPE_BLANK)
#define BIT_CTYPE_CNTRL (1<< ONIGENC_CTYPE_CNTRL)
#define BIT_CTYPE_DIGIT (1<< ONIGENC_CTYPE_DIGIT)
#define BIT_CTYPE_GRAPH (1<< ONIGENC_CTYPE_GRAPH)
#define BIT_CTYPE_LOWER (1<< ONIGENC_CTYPE_LOWER)
#define BIT_CTYPE_PRINT (1<< ONIGENC_CTYPE_PRINT)
#define BIT_CTYPE_PUNCT (1<< ONIGENC_CTYPE_PUNCT)
#define BIT_CTYPE_SPACE (1<< ONIGENC_CTYPE_SPACE)
#define BIT_CTYPE_UPPER (1<< ONIGENC_CTYPE_UPPER)
#define BIT_CTYPE_XDIGIT (1<< ONIGENC_CTYPE_XDIGIT)
#define BIT_CTYPE_WORD (1<< ONIGENC_CTYPE_WORD)
#define BIT_CTYPE_ALNUM (1<< ONIGENC_CTYPE_ALNUM)
#define BIT_CTYPE_ASCII (1<< ONIGENC_CTYPE_ASCII)
#define CTYPE_TO_BIT(ctype) (1<<(ctype))
#define CTYPE_IS_WORD_GRAPH_PRINT(ctype) \
((ctype) == ONIGENC_CTYPE_WORD || (ctype) == ONIGENC_CTYPE_GRAPH ||\
(ctype) == ONIGENC_CTYPE_PRINT)
typedef struct {
UChar *name;
int ctype;
short int len;
} PosixBracketEntryType;
/* #define USE_CRNL_AS_LINE_TERMINATOR */
#define USE_UNICODE_PROPERTIES
/* #define USE_UNICODE_CASE_FOLD_TURKISH_AZERI */
/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTF#18 */
#define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII
/* for encoding system implementation (internal) */
ONIG_EXTERN int onigenc_ascii_apply_all_case_fold P_((OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg));
ONIG_EXTERN int onigenc_ascii_get_case_fold_codes_by_str P_((OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]));
ONIG_EXTERN int onigenc_apply_all_case_fold_with_map P_((int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg));
ONIG_EXTERN int onigenc_get_case_fold_codes_by_str_with_map P_((int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]));
ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[]));
ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end));
/* methods for single byte encoding */
ONIG_EXTERN int onigenc_ascii_mbc_case_fold P_((OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower));
ONIG_EXTERN int onigenc_single_byte_mbc_enc_len P_((const UChar* p));
ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((const UChar* p, const UChar* end));
ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code));
ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf));
ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((const UChar* start, const UChar* s));
ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((const UChar* s, const UChar* end));
ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((const UChar* s, const UChar* end));
/* methods for multi byte encoding */
ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end));
ONIG_EXTERN int onigenc_mbn_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower));
ONIG_EXTERN int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code));
ONIG_EXTERN int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf));
ONIG_EXTERN int onigenc_minimum_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end));
ONIG_EXTERN int onigenc_unicode_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end));
ONIG_EXTERN int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype));
ONIG_EXTERN int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code));
ONIG_EXTERN int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf));
ONIG_EXTERN int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype));
/* in enc/unicode.c */
ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype));
ONIG_EXTERN int onigenc_utf16_32_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint* ranges[]));
ONIG_EXTERN int onigenc_unicode_ctype_code_range P_((int ctype, const OnigCodePoint* ranges[]));
ONIG_EXTERN int onigenc_unicode_get_case_fold_codes_by_str P_((OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]));
ONIG_EXTERN int onigenc_unicode_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** pp, const UChar* end, UChar* fold));
ONIG_EXTERN int onigenc_unicode_apply_all_case_fold P_((OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg));
#define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8)
#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc)
#define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \
OnigEncISO_8859_1_ToLowerCaseTable[c]
#define ONIGENC_ISO_8859_1_TO_UPPER_CASE(c) \
OnigEncISO_8859_1_ToUpperCaseTable[c]
ONIG_EXTERN const UChar OnigEncISO_8859_1_ToLowerCaseTable[];
ONIG_EXTERN const UChar OnigEncISO_8859_1_ToUpperCaseTable[];
ONIG_EXTERN int
onigenc_with_ascii_strncmp P_((OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii */, int n));
ONIG_EXTERN UChar*
onigenc_step P_((OnigEncoding enc, const UChar* p, const UChar* end, int n));
/* defined in regexec.c, but used in enc/xxx.c */
extern int onig_is_in_code_range P_((const UChar* p, OnigCodePoint code));
ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding;
ONIG_EXTERN const UChar OnigEncAsciiToLowerCaseTable[];
ONIG_EXTERN const UChar OnigEncAsciiToUpperCaseTable[];
ONIG_EXTERN const unsigned short OnigEncAsciiCtypeTable[];
#define ONIGENC_IS_ASCII_CODE(code) ((code) < 0x80)
#define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c]
#define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c]
#define ONIGENC_IS_ASCII_CODE_CTYPE(code,ctype) \
((OnigEncAsciiCtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
#define ONIGENC_IS_ASCII_CODE_CASE_AMBIG(code) \
(ONIGENC_IS_ASCII_CODE_CTYPE(code, ONIGENC_CTYPE_UPPER) ||\
ONIGENC_IS_ASCII_CODE_CTYPE(code, ONIGENC_CTYPE_LOWER))
#endif /* REGENC_H */

View File

@@ -1,387 +0,0 @@
/**********************************************************************
regerror.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regint.h"
#include <stdio.h> /* for vsnprintf() */
#ifdef HAVE_STDARG_PROTOTYPES
#include <stdarg.h>
#define va_init_list(a,b) va_start(a,b)
#else
#include <varargs.h>
#define va_init_list(a,b) va_start(a)
#endif
extern UChar*
onig_error_code_to_format(int code)
{
char *p;
if (code >= 0) return (UChar* )0;
switch (code) {
case ONIG_MISMATCH:
p = "mismatch"; break;
case ONIG_NO_SUPPORT_CONFIG:
p = "no support in this configuration"; break;
case ONIGERR_MEMORY:
p = "fail to memory allocation"; break;
case ONIGERR_MATCH_STACK_LIMIT_OVER:
p = "match-stack limit over"; break;
case ONIGERR_TYPE_BUG:
p = "undefined type (bug)"; break;
case ONIGERR_PARSER_BUG:
p = "internal parser error (bug)"; break;
case ONIGERR_STACK_BUG:
p = "stack error (bug)"; break;
case ONIGERR_UNDEFINED_BYTECODE:
p = "undefined bytecode (bug)"; break;
case ONIGERR_UNEXPECTED_BYTECODE:
p = "unexpected bytecode (bug)"; break;
case ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED:
p = "default multibyte-encoding is not setted"; break;
case ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR:
p = "can't convert to wide-char on specified multibyte-encoding"; break;
case ONIGERR_INVALID_ARGUMENT:
p = "invalid argument"; break;
case ONIGERR_END_PATTERN_AT_LEFT_BRACE:
p = "end pattern at left brace"; break;
case ONIGERR_END_PATTERN_AT_LEFT_BRACKET:
p = "end pattern at left bracket"; break;
case ONIGERR_EMPTY_CHAR_CLASS:
p = "empty char-class"; break;
case ONIGERR_PREMATURE_END_OF_CHAR_CLASS:
p = "premature end of char-class"; break;
case ONIGERR_END_PATTERN_AT_ESCAPE:
p = "end pattern at escape"; break;
case ONIGERR_END_PATTERN_AT_META:
p = "end pattern at meta"; break;
case ONIGERR_END_PATTERN_AT_CONTROL:
p = "end pattern at control"; break;
case ONIGERR_META_CODE_SYNTAX:
p = "invalid meta-code syntax"; break;
case ONIGERR_CONTROL_CODE_SYNTAX:
p = "invalid control-code syntax"; break;
case ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE:
p = "char-class value at end of range"; break;
case ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE:
p = "char-class value at start of range"; break;
case ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS:
p = "unmatched range specifier in char-class"; break;
case ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED:
p = "target of repeat operator is not specified"; break;
case ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID:
p = "target of repeat operator is invalid"; break;
case ONIGERR_NESTED_REPEAT_OPERATOR:
p = "nested repeat operator"; break;
case ONIGERR_UNMATCHED_CLOSE_PARENTHESIS:
p = "unmatched close parenthesis"; break;
case ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS:
p = "end pattern with unmatched parenthesis"; break;
case ONIGERR_END_PATTERN_IN_GROUP:
p = "end pattern in group"; break;
case ONIGERR_UNDEFINED_GROUP_OPTION:
p = "undefined group option"; break;
case ONIGERR_INVALID_POSIX_BRACKET_TYPE:
p = "invalid POSIX bracket type"; break;
case ONIGERR_INVALID_LOOK_BEHIND_PATTERN:
p = "invalid pattern in look-behind"; break;
case ONIGERR_INVALID_REPEAT_RANGE_PATTERN:
p = "invalid repeat range {lower,upper}"; break;
case ONIGERR_TOO_BIG_NUMBER:
p = "too big number"; break;
case ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE:
p = "too big number for repeat range"; break;
case ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE:
p = "upper is smaller than lower in repeat range"; break;
case ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS:
p = "empty range in char class"; break;
case ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE:
p = "mismatch multibyte code length in char-class range"; break;
case ONIGERR_TOO_MANY_MULTI_BYTE_RANGES:
p = "too many multibyte code ranges are specified"; break;
case ONIGERR_TOO_SHORT_MULTI_BYTE_STRING:
p = "too short multibyte code string"; break;
case ONIGERR_TOO_BIG_BACKREF_NUMBER:
p = "too big backref number"; break;
case ONIGERR_INVALID_BACKREF:
#ifdef USE_NAMED_GROUP
p = "invalid backref number/name"; break;
#else
p = "invalid backref number"; break;
#endif
case ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED:
p = "numbered backref/call is not allowed. (use name)"; break;
case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
p = "too big wide-char value"; break;
case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE:
p = "too long wide-char value"; break;
case ONIGERR_INVALID_CODE_POINT_VALUE:
p = "invalid code point value"; break;
case ONIGERR_EMPTY_GROUP_NAME:
p = "group name is empty"; break;
case ONIGERR_INVALID_GROUP_NAME:
p = "invalid group name <%n>"; break;
case ONIGERR_INVALID_CHAR_IN_GROUP_NAME:
#ifdef USE_NAMED_GROUP
p = "invalid char in group name <%n>"; break;
#else
p = "invalid char in group number <%n>"; break;
#endif
case ONIGERR_UNDEFINED_NAME_REFERENCE:
p = "undefined name <%n> reference"; break;
case ONIGERR_UNDEFINED_GROUP_REFERENCE:
p = "undefined group <%n> reference"; break;
case ONIGERR_MULTIPLEX_DEFINED_NAME:
p = "multiplex defined name <%n>"; break;
case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL:
p = "multiplex definition name <%n> call"; break;
case ONIGERR_NEVER_ENDING_RECURSION:
p = "never ending recursion"; break;
case ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY:
p = "group number is too big for capture history"; break;
case ONIGERR_INVALID_CHAR_PROPERTY_NAME:
p = "invalid character property name {%n}"; break;
case ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION:
p = "not supported encoding combination"; break;
case ONIGERR_INVALID_COMBINATION_OF_OPTIONS:
p = "invalid combination of options"; break;
case ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT:
p = "over thread pass limit count"; break;
default:
p = "undefined error code"; break;
}
return (UChar* )p;
}
static void sprint_byte(char* s, unsigned int v)
{
sprintf(s, "%02x", (v & 0377));
}
static void sprint_byte_with_x(char* s, unsigned int v)
{
sprintf(s, "\\x%02x", (v & 0377));
}
static int to_ascii(OnigEncoding enc, UChar *s, UChar *end,
UChar buf[], int buf_size, int *is_over)
{
int len;
UChar *p;
OnigCodePoint code;
if (ONIGENC_MBC_MINLEN(enc) > 1) {
p = s;
len = 0;
while (p < end) {
code = ONIGENC_MBC_TO_CODE(enc, p, end);
if (code >= 0x80) {
if (code > 0xffff && len + 10 <= buf_size) {
sprint_byte_with_x((char*)(&(buf[len])), (unsigned int)(code >> 24));
sprint_byte((char*)(&(buf[len+4])), (unsigned int)(code >> 16));
sprint_byte((char*)(&(buf[len+6])), (unsigned int)(code >> 8));
sprint_byte((char*)(&(buf[len+8])), (unsigned int)code);
len += 10;
}
else if (len + 6 <= buf_size) {
sprint_byte_with_x((char*)(&(buf[len])), (unsigned int)(code >> 8));
sprint_byte((char*)(&(buf[len+4])), (unsigned int)code);
len += 6;
}
else {
break;
}
}
else {
buf[len++] = (UChar )code;
}
p += enclen(enc, p);
if (len >= buf_size) break;
}
*is_over = ((p < end) ? 1 : 0);
}
else {
len = MIN((end - s), buf_size);
xmemcpy(buf, s, (size_t )len);
*is_over = ((buf_size < (end - s)) ? 1 : 0);
}
return len;
}
/* for ONIG_MAX_ERROR_MESSAGE_LEN */
#define MAX_ERROR_PAR_LEN 30
extern int
#ifdef HAVE_STDARG_PROTOTYPES
onig_error_code_to_str(UChar* s, int code, ...)
#else
onig_error_code_to_str(s, code, va_alist)
UChar* s;
int code;
va_dcl
#endif
{
UChar *p, *q;
OnigErrorInfo* einfo;
int len, is_over;
UChar parbuf[MAX_ERROR_PAR_LEN];
va_list vargs;
va_init_list(vargs, code);
switch (code) {
case ONIGERR_UNDEFINED_NAME_REFERENCE:
case ONIGERR_UNDEFINED_GROUP_REFERENCE:
case ONIGERR_MULTIPLEX_DEFINED_NAME:
case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL:
case ONIGERR_INVALID_GROUP_NAME:
case ONIGERR_INVALID_CHAR_IN_GROUP_NAME:
case ONIGERR_INVALID_CHAR_PROPERTY_NAME:
einfo = va_arg(vargs, OnigErrorInfo*);
len = to_ascii(einfo->enc, einfo->par, einfo->par_end,
parbuf, MAX_ERROR_PAR_LEN - 3, &is_over);
q = onig_error_code_to_format(code);
p = s;
while (*q != '\0') {
if (*q == '%') {
q++;
if (*q == 'n') { /* '%n': name */
xmemcpy(p, parbuf, len);
p += len;
if (is_over != 0) {
xmemcpy(p, "...", 3);
p += 3;
}
q++;
}
else
goto normal_char;
}
else {
normal_char:
*p++ = *q++;
}
}
*p = '\0';
len = p - s;
break;
default:
q = onig_error_code_to_format(code);
len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, q);
xmemcpy(s, q, len);
s[len] = '\0';
break;
}
va_end(vargs);
return len;
}
void
#ifdef HAVE_STDARG_PROTOTYPES
onig_snprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc,
UChar* pat, UChar* pat_end, const UChar *fmt, ...)
#else
onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist)
UChar buf[];
int bufsize;
OnigEncoding enc;
UChar* pat;
UChar* pat_end;
const UChar *fmt;
va_dcl
#endif
{
int n, need, len;
UChar *p, *s, *bp;
UChar bs[6];
va_list args;
va_init_list(args, fmt);
n = xvsnprintf((char* )buf, bufsize, (const char* )fmt, args);
va_end(args);
need = (pat_end - pat) * 4 + 4;
if (n + need < bufsize) {
strcat((char* )buf, ": /");
s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf);
p = pat;
while (p < pat_end) {
if (*p == '\\') {
*s++ = *p++;
len = enclen(enc, p);
while (len-- > 0) *s++ = *p++;
}
else if (*p == '/') {
*s++ = (unsigned char )'\\';
*s++ = *p++;
}
else if (ONIGENC_IS_MBC_HEAD(enc, p)) {
len = enclen(enc, p);
if (ONIGENC_MBC_MINLEN(enc) == 1) {
while (len-- > 0) *s++ = *p++;
}
else { /* for UTF16 */
int blen;
while (len-- > 0) {
sprint_byte_with_x((char* )bs, (unsigned int )(*p++));
blen = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs);
bp = bs;
while (blen-- > 0) *s++ = *bp++;
}
}
}
else if (!ONIGENC_IS_CODE_PRINT(enc, *p) &&
!ONIGENC_IS_CODE_SPACE(enc, *p)) {
sprint_byte_with_x((char* )bs, (unsigned int )(*p++));
len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs);
bp = bs;
while (len-- > 0) *s++ = *bp++;
}
else {
*s++ = *p++;
}
}
*s++ = '/';
*s = '\0';
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,222 +0,0 @@
/**********************************************************************
regext.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regint.h"
static void
conv_ext0be32(const UChar* s, const UChar* end, UChar* conv)
{
while (s < end) {
*conv++ = '\0';
*conv++ = '\0';
*conv++ = '\0';
*conv++ = *s++;
}
}
static void
conv_ext0le32(const UChar* s, const UChar* end, UChar* conv)
{
while (s < end) {
*conv++ = *s++;
*conv++ = '\0';
*conv++ = '\0';
*conv++ = '\0';
}
}
static void
conv_ext0be(const UChar* s, const UChar* end, UChar* conv)
{
while (s < end) {
*conv++ = '\0';
*conv++ = *s++;
}
}
static void
conv_ext0le(const UChar* s, const UChar* end, UChar* conv)
{
while (s < end) {
*conv++ = *s++;
*conv++ = '\0';
}
}
static void
conv_swap4bytes(const UChar* s, const UChar* end, UChar* conv)
{
while (s < end) {
*conv++ = s[3];
*conv++ = s[2];
*conv++ = s[1];
*conv++ = s[0];
s += 4;
}
}
static void
conv_swap2bytes(const UChar* s, const UChar* end, UChar* conv)
{
while (s < end) {
*conv++ = s[1];
*conv++ = s[0];
s += 2;
}
}
static int
conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* end,
UChar** conv, UChar** conv_end)
{
int len = end - s;
if (to == ONIG_ENCODING_UTF16_BE) {
if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) {
*conv = (UChar* )xmalloc(len * 2);
CHECK_NULL_RETURN_MEMERR(*conv);
*conv_end = *conv + (len * 2);
conv_ext0be(s, end, *conv);
return 0;
}
else if (from == ONIG_ENCODING_UTF16_LE) {
swap16:
*conv = (UChar* )xmalloc(len);
CHECK_NULL_RETURN_MEMERR(*conv);
*conv_end = *conv + len;
conv_swap2bytes(s, end, *conv);
return 0;
}
}
else if (to == ONIG_ENCODING_UTF16_LE) {
if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) {
*conv = (UChar* )xmalloc(len * 2);
CHECK_NULL_RETURN_MEMERR(*conv);
*conv_end = *conv + (len * 2);
conv_ext0le(s, end, *conv);
return 0;
}
else if (from == ONIG_ENCODING_UTF16_BE) {
goto swap16;
}
}
if (to == ONIG_ENCODING_UTF32_BE) {
if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) {
*conv = (UChar* )xmalloc(len * 4);
CHECK_NULL_RETURN_MEMERR(*conv);
*conv_end = *conv + (len * 4);
conv_ext0be32(s, end, *conv);
return 0;
}
else if (from == ONIG_ENCODING_UTF32_LE) {
swap32:
*conv = (UChar* )xmalloc(len);
CHECK_NULL_RETURN_MEMERR(*conv);
*conv_end = *conv + len;
conv_swap4bytes(s, end, *conv);
return 0;
}
}
else if (to == ONIG_ENCODING_UTF32_LE) {
if (from == ONIG_ENCODING_ASCII || from == ONIG_ENCODING_ISO_8859_1) {
*conv = (UChar* )xmalloc(len * 4);
CHECK_NULL_RETURN_MEMERR(*conv);
*conv_end = *conv + (len * 4);
conv_ext0le32(s, end, *conv);
return 0;
}
else if (from == ONIG_ENCODING_UTF32_BE) {
goto swap32;
}
}
return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION;
}
extern int
onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
OnigCompileInfo* ci, OnigErrorInfo* einfo)
{
int r;
UChar *cpat, *cpat_end;
if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
if (ci->pattern_enc != ci->target_enc) {
r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end,
&cpat, &cpat_end);
if (r) return r;
}
else {
cpat = (UChar* )pattern;
cpat_end = (UChar* )pattern_end;
}
*reg = (regex_t* )xmalloc(sizeof(regex_t));
if (IS_NULL(*reg)) {
r = ONIGERR_MEMORY;
goto err2;
}
r = onig_reg_init(*reg, ci->option, ci->case_fold_flag, ci->target_enc,
ci->syntax);
if (r) goto err;
r = onig_compile(*reg, cpat, cpat_end, einfo);
if (r) {
err:
onig_free(*reg);
*reg = NULL;
}
err2:
if (cpat != pattern) xfree(cpat);
return r;
}
#ifdef USE_RECOMPILE_API
extern int
onig_recompile_deluxe(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
OnigCompileInfo* ci, OnigErrorInfo* einfo)
{
int r;
regex_t *new_reg;
r = onig_new_deluxe(&new_reg, pattern, pattern_end, ci, einfo);
if (r) return r;
if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
onig_transfer(reg, new_reg);
}
else {
onig_chain_link_add(reg, new_reg);
}
return 0;
}
#endif

View File

@@ -1,167 +0,0 @@
/**********************************************************************
reggnu.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regint.h"
#ifndef ONIGGNU_H
#include "oniggnu.h"
#endif
extern void
re_free_registers(OnigRegion* r)
{
/* 0: don't free self */
onig_region_free(r, 0);
}
extern int
re_adjust_startpos(regex_t* reg, const char* string, int size,
int startpos, int range)
{
if (startpos > 0 && ONIGENC_MBC_MAXLEN(reg->enc) != 1 && startpos < size) {
UChar *p;
UChar *s = (UChar* )string + startpos;
if (range > 0) {
p = onigenc_get_right_adjust_char_head(reg->enc, (UChar* )string, s);
}
else {
p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, (UChar* )string, s);
}
return p - (UChar* )string;
}
return startpos;
}
extern int
re_match(regex_t* reg, const char* str, int size, int pos,
struct re_registers* regs)
{
return onig_match(reg, (UChar* )str, (UChar* )(str + size),
(UChar* )(str + pos), regs, ONIG_OPTION_NONE);
}
extern int
re_search(regex_t* bufp, const char* string, int size, int startpos, int range,
struct re_registers* regs)
{
return onig_search(bufp, (UChar* )string, (UChar* )(string + size),
(UChar* )(string + startpos),
(UChar* )(string + startpos + range),
regs, ONIG_OPTION_NONE);
}
extern int
re_compile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf)
{
int r;
OnigErrorInfo einfo;
r = onig_compile(reg, (UChar* )pattern, (UChar* )(pattern + size), &einfo);
if (r != ONIG_NORMAL) {
if (IS_NOT_NULL(ebuf))
(void )onig_error_code_to_str((UChar* )ebuf, r, &einfo);
}
return r;
}
#ifdef USE_RECOMPILE_API
extern int
re_recompile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf)
{
int r;
OnigErrorInfo einfo;
OnigEncoding enc;
/* I think encoding and options should be arguments of this function.
But this is adapted to present re.c. (2002/11/29)
*/
enc = OnigEncDefaultCharEncoding;
r = onig_recompile(reg, (UChar* )pattern, (UChar* )(pattern + size),
reg->options, enc, OnigDefaultSyntax, &einfo);
if (r != ONIG_NORMAL) {
if (IS_NOT_NULL(ebuf))
(void )onig_error_code_to_str((UChar* )ebuf, r, &einfo);
}
return r;
}
#endif
extern void
re_free_pattern(regex_t* reg)
{
onig_free(reg);
}
extern int
re_alloc_pattern(regex_t** reg)
{
*reg = (regex_t* )xmalloc(sizeof(regex_t));
if (IS_NULL(*reg)) return ONIGERR_MEMORY;
return onig_reg_init(*reg, ONIG_OPTION_DEFAULT,
ONIGENC_CASE_FOLD_DEFAULT,
OnigEncDefaultCharEncoding,
OnigDefaultSyntax);
}
extern void
re_set_casetable(const char* table)
{
onigenc_set_default_caseconv_table((UChar* )table);
}
extern void
re_mbcinit(int mb_code)
{
OnigEncoding enc;
switch (mb_code) {
case RE_MBCTYPE_ASCII:
enc = ONIG_ENCODING_ASCII;
break;
case RE_MBCTYPE_EUC:
enc = ONIG_ENCODING_EUC_JP;
break;
case RE_MBCTYPE_SJIS:
enc = ONIG_ENCODING_SJIS;
break;
case RE_MBCTYPE_UTF8:
enc = ONIG_ENCODING_UTF8;
break;
default:
return ;
break;
}
onigenc_set_default_encoding(enc);
}

View File

@@ -1,817 +0,0 @@
#ifndef REGINT_H
#define REGINT_H
/**********************************************************************
regint.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* for debug */
/* #define ONIG_DEBUG_PARSE_TREE */
/* #define ONIG_DEBUG_COMPILE */
/* #define ONIG_DEBUG_SEARCH */
/* #define ONIG_DEBUG_MATCH */
/* #define ONIG_DONT_OPTIMIZE */
/* for byte-code statistical data. */
/* #define ONIG_DEBUG_STATISTICS */
#if defined(ONIG_DEBUG_PARSE_TREE) || defined(ONIG_DEBUG_MATCH) || \
defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \
defined(ONIG_DEBUG_STATISTICS)
#ifndef ONIG_DEBUG
#define ONIG_DEBUG
#endif
#endif
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
(defined(__ppc__) && defined(__APPLE__)) || \
defined(__x86_64) || defined(__x86_64__) || \
defined(__mc68020__)
#define PLATFORM_UNALIGNED_WORD_ACCESS
#endif
/* config */
/* spec. config */
#define USE_NAMED_GROUP
#define USE_SUBEXP_CALL
#define USE_BACKREF_WITH_LEVEL /* \k<name+n>, \k<name-n> */
#define USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT /* /(?:()|())*\2/ */
#define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */
#define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
/* #define USE_RECOMPILE_API */
/* !!! moved to regenc.h. */ /* #define USE_CRNL_AS_LINE_TERMINATOR */
/* internal config */
#define USE_PARSE_TREE_NODE_RECYCLE
#define USE_OP_PUSH_OR_JUMP_EXACT
#define USE_QTFR_PEEK_NEXT
#define USE_ST_LIBRARY
#define USE_SHARED_CCLASS_TABLE
#define INIT_MATCH_STACK_SIZE 160
#define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */
#if defined(__GNUC__)
# define ARG_UNUSED __attribute__ ((unused))
#else
# define ARG_UNUSED
#endif
/* */
/* escape other system UChar definition */
#include "onig_config.h"
#ifdef ONIG_ESCAPE_UCHAR_COLLISION
#undef ONIG_ESCAPE_UCHAR_COLLISION
#endif
#define USE_WORD_BEGIN_END /* "\<", "\>" */
#define USE_CAPTURE_HISTORY
#define USE_VARIABLE_META_CHARS
#define USE_POSIX_API_REGION_OPTION
#define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
/* #define USE_COMBINATION_EXPLOSION_CHECK */ /* (X*)* */
/* #define USE_MULTI_THREAD_SYSTEM */
#define THREAD_SYSTEM_INIT /* depend on thread system */
#define THREAD_SYSTEM_END /* depend on thread system */
#define THREAD_ATOMIC_START /* depend on thread system */
#define THREAD_ATOMIC_END /* depend on thread system */
#define THREAD_PASS /* depend on thread system */
#define xmalloc malloc
#define xrealloc realloc
#define xcalloc calloc
#define xfree free
#define CHECK_INTERRUPT_IN_MATCH_AT
#define st_init_table onig_st_init_table
#define st_init_table_with_size onig_st_init_table_with_size
#define st_init_numtable onig_st_init_numtable
#define st_init_numtable_with_size onig_st_init_numtable_with_size
#define st_init_strtable onig_st_init_strtable
#define st_init_strtable_with_size onig_st_init_strtable_with_size
#define st_delete onig_st_delete
#define st_delete_safe onig_st_delete_safe
#define st_insert onig_st_insert
#define st_lookup onig_st_lookup
#define st_foreach onig_st_foreach
#define st_add_direct onig_st_add_direct
#define st_free_table onig_st_free_table
#define st_cleanup_safe onig_st_cleanup_safe
#define st_copy onig_st_copy
#define st_nothing_key_clone onig_st_nothing_key_clone
#define st_nothing_key_free onig_st_nothing_key_free
/* */
#define onig_st_is_member st_is_member
#define STATE_CHECK_STRING_THRESHOLD_LEN 7
#define STATE_CHECK_BUFF_MAX_SIZE 0x4000
#define THREAD_PASS_LIMIT_COUNT 8
#define xmemset memset
#define xmemcpy memcpy
#define xmemmove memmove
#if defined(_WIN32) && !defined(__GNUC__)
#define xalloca _alloca
#define xvsnprintf _vsnprintf
#else
#define xalloca alloca
#define xvsnprintf vsnprintf
#endif
#if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM)
#define ONIG_STATE_INC(reg) (reg)->state++
#define ONIG_STATE_DEC(reg) (reg)->state--
#define ONIG_STATE_INC_THREAD(reg) do {\
THREAD_ATOMIC_START;\
(reg)->state++;\
THREAD_ATOMIC_END;\
} while(0)
#define ONIG_STATE_DEC_THREAD(reg) do {\
THREAD_ATOMIC_START;\
(reg)->state--;\
THREAD_ATOMIC_END;\
} while(0)
#else
#define ONIG_STATE_INC(reg) /* Nothing */
#define ONIG_STATE_DEC(reg) /* Nothing */
#define ONIG_STATE_INC_THREAD(reg) /* Nothing */
#define ONIG_STATE_DEC_THREAD(reg) /* Nothing */
#endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#if defined(HAVE_ALLOCA_H) && !defined(_WIN32) && !defined(__GNUC__)
#include <alloca.h>
#endif
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <ctype.h>
#ifdef HAVE_SYS_TYPES_H
#ifndef __BORLANDC__
#include <sys/types.h>
#endif
#endif
#ifdef __BORLANDC__
#include <malloc.h>
#endif
#ifdef ONIG_DEBUG
# include <stdio.h>
#endif
#include "regenc.h"
#ifdef MIN
#undef MIN
#endif
#ifdef MAX
#undef MAX
#endif
#define MIN(a,b) (((a)>(b))?(b):(a))
#define MAX(a,b) (((a)<(b))?(b):(a))
#define IS_NULL(p) (((void*)(p)) == (void*)0)
#define IS_NOT_NULL(p) (((void*)(p)) != (void*)0)
#define CHECK_NULL_RETURN(p) if (IS_NULL(p)) return NULL
#define CHECK_NULL_RETURN_MEMERR(p) if (IS_NULL(p)) return ONIGERR_MEMORY
#define NULL_UCHARP ((UChar* )0)
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
#define PLATFORM_GET_INC(val,p,type) do{\
val = *(type* )p;\
(p) += sizeof(type);\
} while(0)
#else
#define PLATFORM_GET_INC(val,p,type) do{\
xmemcpy(&val, (p), sizeof(type));\
(p) += sizeof(type);\
} while(0)
/* sizeof(OnigCodePoint) */
#define WORD_ALIGNMENT_SIZE SIZEOF_LONG
#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\
(pad_size) = WORD_ALIGNMENT_SIZE \
- ((unsigned int )(addr) % WORD_ALIGNMENT_SIZE);\
if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\
} while (0)
#define ALIGNMENT_RIGHT(addr) do {\
(addr) += (WORD_ALIGNMENT_SIZE - 1);\
(addr) -= ((unsigned int )(addr) % WORD_ALIGNMENT_SIZE);\
} while (0)
#endif /* PLATFORM_UNALIGNED_WORD_ACCESS */
/* stack pop level */
#define STACK_POP_LEVEL_FREE 0
#define STACK_POP_LEVEL_MEM_START 1
#define STACK_POP_LEVEL_ALL 2
/* optimize flags */
#define ONIG_OPTIMIZE_NONE 0
#define ONIG_OPTIMIZE_EXACT 1 /* Slow Search */
#define ONIG_OPTIMIZE_EXACT_BM 2 /* Boyer Moore Search */
#define ONIG_OPTIMIZE_EXACT_BM_NOT_REV 3 /* BM (but not simple match) */
#define ONIG_OPTIMIZE_EXACT_IC 4 /* Slow Search (ignore case) */
#define ONIG_OPTIMIZE_MAP 5 /* char map */
/* bit status */
typedef unsigned int BitStatusType;
#define BIT_STATUS_BITS_NUM (sizeof(BitStatusType) * 8)
#define BIT_STATUS_CLEAR(stats) (stats) = 0
#define BIT_STATUS_ON_ALL(stats) (stats) = ~((BitStatusType )0)
#define BIT_STATUS_AT(stats,n) \
((n) < (int )BIT_STATUS_BITS_NUM ? ((stats) & (1 << n)) : ((stats) & 1))
#define BIT_STATUS_ON_AT(stats,n) do {\
if ((n) < (int )BIT_STATUS_BITS_NUM) \
(stats) |= (1 << (n));\
else\
(stats) |= 1;\
} while (0)
#define BIT_STATUS_ON_AT_SIMPLE(stats,n) do {\
if ((n) < (int )BIT_STATUS_BITS_NUM)\
(stats) |= (1 << (n));\
} while (0)
#define INT_MAX_LIMIT ((1UL << (SIZEOF_INT * 8 - 1)) - 1)
#define DIGITVAL(code) ((code) - '0')
#define ODIGITVAL(code) DIGITVAL(code)
#define XDIGITVAL(enc,code) \
(ONIGENC_IS_CODE_DIGIT(enc,code) ? DIGITVAL(code) \
: (ONIGENC_IS_CODE_UPPER(enc,code) ? (code) - 'A' + 10 : (code) - 'a' + 10))
#define IS_SINGLELINE(option) ((option) & ONIG_OPTION_SINGLELINE)
#define IS_MULTILINE(option) ((option) & ONIG_OPTION_MULTILINE)
#define IS_IGNORECASE(option) ((option) & ONIG_OPTION_IGNORECASE)
#define IS_EXTEND(option) ((option) & ONIG_OPTION_EXTEND)
#define IS_FIND_LONGEST(option) ((option) & ONIG_OPTION_FIND_LONGEST)
#define IS_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY)
#define IS_FIND_CONDITION(option) ((option) & \
(ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY))
#define IS_NOTBOL(option) ((option) & ONIG_OPTION_NOTBOL)
#define IS_NOTEOL(option) ((option) & ONIG_OPTION_NOTEOL)
#define IS_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION)
/* OP_SET_OPTION is required for these options.
#define IS_DYNAMIC_OPTION(option) \
(((option) & (ONIG_OPTION_MULTILINE | ONIG_OPTION_IGNORECASE)) != 0)
*/
/* ignore-case and multibyte status are included in compiled code. */
#define IS_DYNAMIC_OPTION(option) 0
#define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \
((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR)
#define REPEAT_INFINITE -1
#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE)
/* bitset */
#define BITS_PER_BYTE 8
#define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE)
#define BITS_IN_ROOM (sizeof(Bits) * BITS_PER_BYTE)
#define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM)
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
typedef unsigned int Bits;
#else
typedef unsigned char Bits;
#endif
typedef Bits BitSet[BITSET_SIZE];
typedef Bits* BitSetRef;
#define SIZE_BITSET sizeof(BitSet)
#define BITSET_CLEAR(bs) do {\
int i;\
for (i = 0; i < (int )BITSET_SIZE; i++) { (bs)[i] = 0; } \
} while (0)
#define BS_ROOM(bs,pos) (bs)[pos / BITS_IN_ROOM]
#define BS_BIT(pos) (1 << (pos % BITS_IN_ROOM))
#define BITSET_AT(bs, pos) (BS_ROOM(bs,pos) & BS_BIT(pos))
#define BITSET_SET_BIT(bs, pos) BS_ROOM(bs,pos) |= BS_BIT(pos)
#define BITSET_CLEAR_BIT(bs, pos) BS_ROOM(bs,pos) &= ~(BS_BIT(pos))
#define BITSET_INVERT_BIT(bs, pos) BS_ROOM(bs,pos) ^= BS_BIT(pos)
/* bytes buffer */
typedef struct _BBuf {
UChar* p;
unsigned int used;
unsigned int alloc;
} BBuf;
#define BBUF_INIT(buf,size) onig_bbuf_init((BBuf* )(buf), (size))
#define BBUF_SIZE_INC(buf,inc) do{\
(buf)->alloc += (inc);\
(buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\
if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\
} while (0)
#define BBUF_EXPAND(buf,low) do{\
do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\
(buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\
if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\
} while (0)
#define BBUF_ENSURE_SIZE(buf,size) do{\
unsigned int new_alloc = (buf)->alloc;\
while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\
if ((buf)->alloc != new_alloc) {\
(buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\
if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\
(buf)->alloc = new_alloc;\
}\
} while (0)
#define BBUF_WRITE(buf,pos,bytes,n) do{\
int used = (pos) + (n);\
if ((buf)->alloc < (unsigned int )used) BBUF_EXPAND((buf),used);\
xmemcpy((buf)->p + (pos), (bytes), (n));\
if ((buf)->used < (unsigned int )used) (buf)->used = used;\
} while (0)
#define BBUF_WRITE1(buf,pos,byte) do{\
int used = (pos) + 1;\
if ((buf)->alloc < (unsigned int )used) BBUF_EXPAND((buf),used);\
(buf)->p[(pos)] = (byte);\
if ((buf)->used < (unsigned int )used) (buf)->used = used;\
} while (0)
#define BBUF_ADD(buf,bytes,n) BBUF_WRITE((buf),(buf)->used,(bytes),(n))
#define BBUF_ADD1(buf,byte) BBUF_WRITE1((buf),(buf)->used,(byte))
#define BBUF_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used)
#define BBUF_GET_OFFSET_POS(buf) ((buf)->used)
/* from < to */
#define BBUF_MOVE_RIGHT(buf,from,to,n) do {\
if ((unsigned int )((to)+(n)) > (buf)->alloc) BBUF_EXPAND((buf),(to) + (n));\
xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\
} while (0)
/* from > to */
#define BBUF_MOVE_LEFT(buf,from,to,n) do {\
xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
} while (0)
/* from > to */
#define BBUF_MOVE_LEFT_REDUCE(buf,from,to) do {\
xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\
(buf)->used -= (from - to);\
} while (0)
#define BBUF_INSERT(buf,pos,bytes,n) do {\
if (pos >= (buf)->used) {\
BBUF_WRITE(buf,pos,bytes,n);\
}\
else {\
BBUF_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\
xmemcpy((buf)->p + (pos), (bytes), (n));\
}\
} while (0)
#define BBUF_GET_BYTE(buf, pos) (buf)->p[(pos)]
#define ANCHOR_BEGIN_BUF (1<<0)
#define ANCHOR_BEGIN_LINE (1<<1)
#define ANCHOR_BEGIN_POSITION (1<<2)
#define ANCHOR_END_BUF (1<<3)
#define ANCHOR_SEMI_END_BUF (1<<4)
#define ANCHOR_END_LINE (1<<5)
#define ANCHOR_WORD_BOUND (1<<6)
#define ANCHOR_NOT_WORD_BOUND (1<<7)
#define ANCHOR_WORD_BEGIN (1<<8)
#define ANCHOR_WORD_END (1<<9)
#define ANCHOR_PREC_READ (1<<10)
#define ANCHOR_PREC_READ_NOT (1<<11)
#define ANCHOR_LOOK_BEHIND (1<<12)
#define ANCHOR_LOOK_BEHIND_NOT (1<<13)
#define ANCHOR_ANYCHAR_STAR (1<<14) /* ".*" optimize info */
#define ANCHOR_ANYCHAR_STAR_ML (1<<15) /* ".*" optimize info (multi-line) */
/* operation code */
enum OpCode {
OP_FINISH = 0, /* matching process terminator (no more alternative) */
OP_END = 1, /* pattern code terminator (success end) */
OP_EXACT1 = 2, /* single byte, N = 1 */
OP_EXACT2, /* single byte, N = 2 */
OP_EXACT3, /* single byte, N = 3 */
OP_EXACT4, /* single byte, N = 4 */
OP_EXACT5, /* single byte, N = 5 */
OP_EXACTN, /* single byte */
OP_EXACTMB2N1, /* mb-length = 2 N = 1 */
OP_EXACTMB2N2, /* mb-length = 2 N = 2 */
OP_EXACTMB2N3, /* mb-length = 2 N = 3 */
OP_EXACTMB2N, /* mb-length = 2 */
OP_EXACTMB3N, /* mb-length = 3 */
OP_EXACTMBN, /* other length */
OP_EXACT1_IC, /* single byte, N = 1, ignore case */
OP_EXACTN_IC, /* single byte, ignore case */
OP_CCLASS,
OP_CCLASS_MB,
OP_CCLASS_MIX,
OP_CCLASS_NOT,
OP_CCLASS_MB_NOT,
OP_CCLASS_MIX_NOT,
OP_CCLASS_NODE, /* pointer to CClassNode node */
OP_ANYCHAR, /* "." */
OP_ANYCHAR_ML, /* "." multi-line */
OP_ANYCHAR_STAR, /* ".*" */
OP_ANYCHAR_ML_STAR, /* ".*" multi-line */
OP_ANYCHAR_STAR_PEEK_NEXT,
OP_ANYCHAR_ML_STAR_PEEK_NEXT,
OP_WORD,
OP_NOT_WORD,
OP_WORD_BOUND,
OP_NOT_WORD_BOUND,
OP_WORD_BEGIN,
OP_WORD_END,
OP_BEGIN_BUF,
OP_END_BUF,
OP_BEGIN_LINE,
OP_END_LINE,
OP_SEMI_END_BUF,
OP_BEGIN_POSITION,
OP_BACKREF1,
OP_BACKREF2,
OP_BACKREFN,
OP_BACKREFN_IC,
OP_BACKREF_MULTI,
OP_BACKREF_MULTI_IC,
OP_BACKREF_WITH_LEVEL, /* \k<xxx+n>, \k<xxx-n> */
OP_MEMORY_START,
OP_MEMORY_START_PUSH, /* push back-tracker to stack */
OP_MEMORY_END_PUSH, /* push back-tracker to stack */
OP_MEMORY_END_PUSH_REC, /* push back-tracker to stack */
OP_MEMORY_END,
OP_MEMORY_END_REC, /* push marker to stack */
OP_FAIL, /* pop stack and move */
OP_JUMP,
OP_PUSH,
OP_POP,
OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */
OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */
OP_REPEAT, /* {n,m} */
OP_REPEAT_NG, /* {n,m}? (non greedy) */
OP_REPEAT_INC,
OP_REPEAT_INC_NG, /* non greedy */
OP_REPEAT_INC_SG, /* search and get in stack */
OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */
OP_NULL_CHECK_START, /* null loop checker start */
OP_NULL_CHECK_END, /* null loop checker end */
OP_NULL_CHECK_END_MEMST, /* null loop checker end (with capture status) */
OP_NULL_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */
OP_PUSH_POS, /* (?=...) start */
OP_POP_POS, /* (?=...) end */
OP_PUSH_POS_NOT, /* (?!...) start */
OP_FAIL_POS, /* (?!...) end */
OP_PUSH_STOP_BT, /* (?>...) start */
OP_POP_STOP_BT, /* (?>...) end */
OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */
OP_PUSH_LOOK_BEHIND_NOT, /* (?<!...) start */
OP_FAIL_LOOK_BEHIND_NOT, /* (?<!...) end */
OP_CALL, /* \g<name> */
OP_RETURN,
OP_STATE_CHECK_PUSH, /* combination explosion check and push */
OP_STATE_CHECK_PUSH_OR_JUMP, /* check ok -> push, else jump */
OP_STATE_CHECK, /* check only */
OP_STATE_CHECK_ANYCHAR_STAR,
OP_STATE_CHECK_ANYCHAR_ML_STAR,
/* no need: IS_DYNAMIC_OPTION() == 0 */
OP_SET_OPTION_PUSH, /* set option and push recover option */
OP_SET_OPTION /* set option */
};
typedef int RelAddrType;
typedef int AbsAddrType;
typedef int LengthType;
typedef int RepeatNumType;
typedef short int MemNumType;
typedef short int StateCheckNumType;
typedef void* PointerType;
#define SIZE_OPCODE 1
#define SIZE_RELADDR sizeof(RelAddrType)
#define SIZE_ABSADDR sizeof(AbsAddrType)
#define SIZE_LENGTH sizeof(LengthType)
#define SIZE_MEMNUM sizeof(MemNumType)
#define SIZE_STATE_CHECK_NUM sizeof(StateCheckNumType)
#define SIZE_REPEATNUM sizeof(RepeatNumType)
#define SIZE_OPTION sizeof(OnigOptionType)
#define SIZE_CODE_POINT sizeof(OnigCodePoint)
#define SIZE_POINTER sizeof(PointerType)
#define GET_RELADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, RelAddrType)
#define GET_ABSADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, AbsAddrType)
#define GET_LENGTH_INC(len,p) PLATFORM_GET_INC(len, p, LengthType)
#define GET_MEMNUM_INC(num,p) PLATFORM_GET_INC(num, p, MemNumType)
#define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType)
#define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType)
#define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType)
#define GET_STATE_CHECK_NUM_INC(num,p) PLATFORM_GET_INC(num, p, StateCheckNumType)
/* code point's address must be aligned address. */
#define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p))
#define GET_BYTE_INC(byte,p) do{\
byte = *(p);\
(p)++;\
} while(0)
/* op-code + arg size */
#define SIZE_OP_ANYCHAR_STAR SIZE_OPCODE
#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1)
#define SIZE_OP_JUMP (SIZE_OPCODE + SIZE_RELADDR)
#define SIZE_OP_PUSH (SIZE_OPCODE + SIZE_RELADDR)
#define SIZE_OP_POP SIZE_OPCODE
#define SIZE_OP_PUSH_OR_JUMP_EXACT1 (SIZE_OPCODE + SIZE_RELADDR + 1)
#define SIZE_OP_PUSH_IF_PEEK_NEXT (SIZE_OPCODE + SIZE_RELADDR + 1)
#define SIZE_OP_REPEAT_INC (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_REPEAT_INC_NG (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_PUSH_POS SIZE_OPCODE
#define SIZE_OP_PUSH_POS_NOT (SIZE_OPCODE + SIZE_RELADDR)
#define SIZE_OP_POP_POS SIZE_OPCODE
#define SIZE_OP_FAIL_POS SIZE_OPCODE
#define SIZE_OP_SET_OPTION (SIZE_OPCODE + SIZE_OPTION)
#define SIZE_OP_SET_OPTION_PUSH (SIZE_OPCODE + SIZE_OPTION)
#define SIZE_OP_FAIL SIZE_OPCODE
#define SIZE_OP_MEMORY_START (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_MEMORY_START_PUSH (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_MEMORY_END_PUSH (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_MEMORY_END_PUSH_REC (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_MEMORY_END (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_MEMORY_END_REC (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_PUSH_STOP_BT SIZE_OPCODE
#define SIZE_OP_POP_STOP_BT SIZE_OPCODE
#define SIZE_OP_NULL_CHECK_START (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_NULL_CHECK_END (SIZE_OPCODE + SIZE_MEMNUM)
#define SIZE_OP_LOOK_BEHIND (SIZE_OPCODE + SIZE_LENGTH)
#define SIZE_OP_PUSH_LOOK_BEHIND_NOT (SIZE_OPCODE + SIZE_RELADDR + SIZE_LENGTH)
#define SIZE_OP_FAIL_LOOK_BEHIND_NOT SIZE_OPCODE
#define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR)
#define SIZE_OP_RETURN SIZE_OPCODE
#ifdef USE_COMBINATION_EXPLOSION_CHECK
#define SIZE_OP_STATE_CHECK (SIZE_OPCODE + SIZE_STATE_CHECK_NUM)
#define SIZE_OP_STATE_CHECK_PUSH (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR)
#define SIZE_OP_STATE_CHECK_PUSH_OR_JUMP (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR)
#define SIZE_OP_STATE_CHECK_ANYCHAR_STAR (SIZE_OPCODE + SIZE_STATE_CHECK_NUM)
#endif
#define MC_ESC(syn) (syn)->meta_char_table.esc
#define MC_ANYCHAR(syn) (syn)->meta_char_table.anychar
#define MC_ANYTIME(syn) (syn)->meta_char_table.anytime
#define MC_ZERO_OR_ONE_TIME(syn) (syn)->meta_char_table.zero_or_one_time
#define MC_ONE_OR_MORE_TIME(syn) (syn)->meta_char_table.one_or_more_time
#define MC_ANYCHAR_ANYTIME(syn) (syn)->meta_char_table.anychar_anytime
#define IS_MC_ESC_CODE(code, syn) \
((code) == MC_ESC(syn) && \
!IS_SYNTAX_OP2((syn), ONIG_SYN_OP2_INEFFECTIVE_ESCAPE))
#define SYN_POSIX_COMMON_OP \
( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \
ONIG_SYN_OP_DECIMAL_BACKREF | \
ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_ASTERISK_ZERO_INF | \
ONIG_SYN_OP_LINE_ANCHOR | \
ONIG_SYN_OP_ESC_CONTROL_CHARS )
#define SYN_GNU_REGEX_OP \
( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | \
ONIG_SYN_OP_POSIX_BRACKET | ONIG_SYN_OP_DECIMAL_BACKREF | \
ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_LPAREN_SUBEXP | \
ONIG_SYN_OP_VBAR_ALT | \
ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | \
ONIG_SYN_OP_QMARK_ZERO_ONE | \
ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR | ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR | \
ONIG_SYN_OP_ESC_W_WORD | \
ONIG_SYN_OP_ESC_B_WORD_BOUND | ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | \
ONIG_SYN_OP_ESC_S_WHITE_SPACE | ONIG_SYN_OP_ESC_D_DIGIT | \
ONIG_SYN_OP_LINE_ANCHOR )
#define SYN_GNU_REGEX_BV \
( ONIG_SYN_CONTEXT_INDEP_ANCHORS | ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | \
ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | ONIG_SYN_ALLOW_INVALID_INTERVAL | \
ONIG_SYN_BACKSLASH_ESCAPE_IN_CC | ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC )
#define NCCLASS_FLAGS(cc) ((cc)->flags)
#define NCCLASS_FLAG_SET(cc,flag) (NCCLASS_FLAGS(cc) |= (flag))
#define NCCLASS_FLAG_CLEAR(cc,flag) (NCCLASS_FLAGS(cc) &= ~(flag))
#define IS_NCCLASS_FLAG_ON(cc,flag) ((NCCLASS_FLAGS(cc) & (flag)) != 0)
/* cclass node */
#define FLAG_NCCLASS_NOT (1<<0)
#define FLAG_NCCLASS_SHARE (1<<1)
#define NCCLASS_SET_NOT(nd) NCCLASS_FLAG_SET(nd, FLAG_NCCLASS_NOT)
#define NCCLASS_SET_SHARE(nd) NCCLASS_FLAG_SET(nd, FLAG_NCCLASS_SHARE)
#define NCCLASS_CLEAR_NOT(nd) NCCLASS_FLAG_CLEAR(nd, FLAG_NCCLASS_NOT)
#define IS_NCCLASS_NOT(nd) IS_NCCLASS_FLAG_ON(nd, FLAG_NCCLASS_NOT)
#define IS_NCCLASS_SHARE(nd) IS_NCCLASS_FLAG_ON(nd, FLAG_NCCLASS_SHARE)
typedef struct {
int type;
/* struct _Node* next; */
/* unsigned int flags; */
} NodeBase;
typedef struct {
NodeBase base;
unsigned int flags;
BitSet bs;
BBuf* mbuf; /* multi-byte info or NULL */
} CClassNode;
typedef long OnigStackIndex;
typedef struct _OnigStackType {
unsigned int type;
union {
struct {
UChar *pcode; /* byte code position */
UChar *pstr; /* string position */
UChar *pstr_prev; /* previous char position of pstr */
#ifdef USE_COMBINATION_EXPLOSION_CHECK
unsigned int state_check;
#endif
} state;
struct {
int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */
UChar *pcode; /* byte code position (head of repeated target) */
int num; /* repeat id */
} repeat;
struct {
OnigStackIndex si; /* index of stack */
} repeat_inc;
struct {
int num; /* memory num */
UChar *pstr; /* start/end position */
/* Following information is setted, if this stack type is MEM-START */
OnigStackIndex start; /* prev. info (for backtrack "(...)*" ) */
OnigStackIndex end; /* prev. info (for backtrack "(...)*" ) */
} mem;
struct {
int num; /* null check id */
UChar *pstr; /* start position */
} null_check;
#ifdef USE_SUBEXP_CALL
struct {
UChar *ret_addr; /* byte code position */
int num; /* null check id */
UChar *pstr; /* string position */
} call_frame;
#endif
} u;
} OnigStackType;
typedef struct {
void* stack_p;
int stack_n;
OnigOptionType options;
OnigRegion* region;
const UChar* start; /* search start position (for \G: BEGIN_POSITION) */
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
int best_len; /* for ONIG_OPTION_FIND_LONGEST */
UChar* best_s;
#endif
#ifdef USE_COMBINATION_EXPLOSION_CHECK
void* state_check_buff;
int state_check_buff_size;
#endif
} OnigMatchArg;
#define IS_CODE_SB_WORD(enc,code) \
(ONIGENC_IS_CODE_ASCII(code) && ONIGENC_IS_CODE_WORD(enc,code))
typedef struct OnigEndCallListItem {
struct OnigEndCallListItem* next;
void (*func)(void);
} OnigEndCallListItemType;
extern void onig_add_end_call(void (*func)(void));
#ifdef ONIG_DEBUG
typedef struct {
short int opcode;
char* name;
short int arg_type;
} OnigOpInfoType;
extern OnigOpInfoType OnigOpInfo[];
extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp, OnigEncoding enc));
#ifdef ONIG_DEBUG_STATISTICS
extern void onig_statistics_init P_((void));
extern void onig_print_statistics P_((FILE* f));
#endif
#endif
extern UChar* onig_error_code_to_format P_((int code));
extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...));
extern int onig_bbuf_init P_((BBuf* buf, int size));
extern int onig_compile P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo));
extern void onig_chain_reduce P_((regex_t* reg));
extern void onig_chain_link_add P_((regex_t* to, regex_t* add));
extern void onig_transfer P_((regex_t* to, regex_t* from));
extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc));
extern int onig_is_code_in_cc_len P_((int enclen, OnigCodePoint code, CClassNode* cc));
/* strend hash */
typedef void hash_table_type;
typedef unsigned long hash_data_type;
extern hash_table_type* onig_st_init_strend_table_with_size P_((int size));
extern int onig_st_lookup_strend P_((hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type *value));
extern int onig_st_insert_strend P_((hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type value));
/* encoding property management */
#define PROPERTY_LIST_ADD_PROP(Name, CR) \
r = onigenc_property_list_add_property((UChar* )Name, CR,\
&PropertyNameTable, &PropertyList, &PropertyListNum,\
&PropertyListSize);\
if (r != 0) goto end
#define PROPERTY_LIST_INIT_CHECK \
if (PropertyInited == 0) {\
int r = onigenc_property_list_init(init_property_list);\
if (r != 0) return r;\
}
extern int onigenc_property_list_add_property P_((UChar* name, const OnigCodePoint* prop, hash_table_type **table, const OnigCodePoint*** plist, int *pnum, int *psize));
typedef int (*ONIGENC_INIT_PROPERTY_LIST_FUNC_TYPE)(void);
extern int onigenc_property_list_init P_((ONIGENC_INIT_PROPERTY_LIST_FUNC_TYPE));
#endif /* REGINT_H */

File diff suppressed because it is too large Load Diff

View File

@@ -1,351 +0,0 @@
#ifndef REGPARSE_H
#define REGPARSE_H
/**********************************************************************
regparse.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regint.h"
/* node type */
#define NT_STR 0
#define NT_CCLASS 1
#define NT_CTYPE 2
#define NT_CANY 3
#define NT_BREF 4
#define NT_QTFR 5
#define NT_ENCLOSE 6
#define NT_ANCHOR 7
#define NT_LIST 8
#define NT_ALT 9
#define NT_CALL 10
/* node type bit */
#define NTYPE2BIT(type) (1<<(type))
#define BIT_NT_STR NTYPE2BIT(NT_STR)
#define BIT_NT_CCLASS NTYPE2BIT(NT_CCLASS)
#define BIT_NT_CTYPE NTYPE2BIT(NT_CTYPE)
#define BIT_NT_CANY NTYPE2BIT(NT_CANY)
#define BIT_NT_BREF NTYPE2BIT(NT_BREF)
#define BIT_NT_QTFR NTYPE2BIT(NT_QTFR)
#define BIT_NT_ENCLOSE NTYPE2BIT(NT_ENCLOSE)
#define BIT_NT_ANCHOR NTYPE2BIT(NT_ANCHOR)
#define BIT_NT_LIST NTYPE2BIT(NT_LIST)
#define BIT_NT_ALT NTYPE2BIT(NT_ALT)
#define BIT_NT_CALL NTYPE2BIT(NT_CALL)
#define IS_NODE_TYPE_SIMPLE(type) \
((NTYPE2BIT(type) & (BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE |\
BIT_NT_CANY | BIT_NT_BREF)) != 0)
#define NTYPE(node) ((node)->u.base.type)
#define SET_NTYPE(node, ntype) (node)->u.base.type = (ntype)
#define NSTR(node) (&((node)->u.str))
#define NCCLASS(node) (&((node)->u.cclass))
#define NCTYPE(node) (&((node)->u.ctype))
#define NBREF(node) (&((node)->u.bref))
#define NQTFR(node) (&((node)->u.qtfr))
#define NENCLOSE(node) (&((node)->u.enclose))
#define NANCHOR(node) (&((node)->u.anchor))
#define NCONS(node) (&((node)->u.cons))
#define NCALL(node) (&((node)->u.call))
#define NCAR(node) (NCONS(node)->car)
#define NCDR(node) (NCONS(node)->cdr)
#define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML)
#define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)
#define ENCLOSE_MEMORY (1<<0)
#define ENCLOSE_OPTION (1<<1)
#define ENCLOSE_STOP_BACKTRACK (1<<2)
#define NODE_STR_MARGIN 16
#define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */
#define NODE_BACKREFS_SIZE 6
#define NSTR_RAW (1<<0) /* by backslashed number */
#define NSTR_AMBIG (1<<1)
#define NSTR_DONT_GET_OPT_INFO (1<<2)
#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s)
#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW
#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW
#define NSTRING_SET_AMBIG(node) (node)->u.str.flag |= NSTR_AMBIG
#define NSTRING_SET_DONT_GET_OPT_INFO(node) \
(node)->u.str.flag |= NSTR_DONT_GET_OPT_INFO
#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0)
#define NSTRING_IS_AMBIG(node) (((node)->u.str.flag & NSTR_AMBIG) != 0)
#define NSTRING_IS_DONT_GET_OPT_INFO(node) \
(((node)->u.str.flag & NSTR_DONT_GET_OPT_INFO) != 0)
#define BACKREFS_P(br) \
(IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static);
#define NQ_TARGET_ISNOT_EMPTY 0
#define NQ_TARGET_IS_EMPTY 1
#define NQ_TARGET_IS_EMPTY_MEM 2
#define NQ_TARGET_IS_EMPTY_REC 3
/* status bits */
#define NST_MIN_FIXED (1<<0)
#define NST_MAX_FIXED (1<<1)
#define NST_CLEN_FIXED (1<<2)
#define NST_MARK1 (1<<3)
#define NST_MARK2 (1<<4)
#define NST_MEM_BACKREFED (1<<5)
#define NST_STOP_BT_SIMPLE_REPEAT (1<<6)
#define NST_RECURSION (1<<7)
#define NST_CALLED (1<<8)
#define NST_ADDR_FIXED (1<<9)
#define NST_NAMED_GROUP (1<<10)
#define NST_NAME_REF (1<<11)
#define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in stack. */
#define NST_NEST_LEVEL (1<<13)
#define NST_BY_NUMBER (1<<14) /* {n,m} */
#define SET_ENCLOSE_STATUS(node,f) (node)->u.enclose.state |= (f)
#define CLEAR_ENCLOSE_STATUS(node,f) (node)->u.enclose.state &= ~(f)
#define IS_ENCLOSE_CALLED(en) (((en)->state & NST_CALLED) != 0)
#define IS_ENCLOSE_ADDR_FIXED(en) (((en)->state & NST_ADDR_FIXED) != 0)
#define IS_ENCLOSE_RECURSION(en) (((en)->state & NST_RECURSION) != 0)
#define IS_ENCLOSE_MARK1(en) (((en)->state & NST_MARK1) != 0)
#define IS_ENCLOSE_MARK2(en) (((en)->state & NST_MARK2) != 0)
#define IS_ENCLOSE_MIN_FIXED(en) (((en)->state & NST_MIN_FIXED) != 0)
#define IS_ENCLOSE_MAX_FIXED(en) (((en)->state & NST_MAX_FIXED) != 0)
#define IS_ENCLOSE_CLEN_FIXED(en) (((en)->state & NST_CLEN_FIXED) != 0)
#define IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(en) \
(((en)->state & NST_STOP_BT_SIMPLE_REPEAT) != 0)
#define IS_ENCLOSE_NAMED_GROUP(en) (((en)->state & NST_NAMED_GROUP) != 0)
#define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION
#define IS_CALL_RECURSION(cn) (((cn)->state & NST_RECURSION) != 0)
#define IS_CALL_NAME_REF(cn) (((cn)->state & NST_NAME_REF) != 0)
#define IS_BACKREF_NAME_REF(bn) (((bn)->state & NST_NAME_REF) != 0)
#define IS_BACKREF_NEST_LEVEL(bn) (((bn)->state & NST_NEST_LEVEL) != 0)
#define IS_QUANTIFIER_IN_REPEAT(qn) (((qn)->state & NST_IN_REPEAT) != 0)
#define IS_QUANTIFIER_BY_NUMBER(qn) (((qn)->state & NST_BY_NUMBER) != 0)
#define CALLNODE_REFNUM_UNDEF -1
typedef struct {
NodeBase base;
UChar* s;
UChar* end;
unsigned int flag;
int capa; /* (allocated size - 1) or 0: use buf[] */
UChar buf[NODE_STR_BUF_SIZE];
} StrNode;
typedef struct {
NodeBase base;
int state;
struct _Node* target;
int lower;
int upper;
int greedy;
int target_empty_info;
struct _Node* head_exact;
struct _Node* next_head_exact;
int is_refered; /* include called node. don't eliminate even if {0} */
#ifdef USE_COMBINATION_EXPLOSION_CHECK
int comb_exp_check_num; /* 1,2,3...: check, 0: no check */
#endif
} QtfrNode;
typedef struct {
NodeBase base;
int state;
int type;
int regnum;
OnigOptionType option;
struct _Node* target;
AbsAddrType call_addr;
/* for multiple call reference */
OnigDistance min_len; /* min length (byte) */
OnigDistance max_len; /* max length (byte) */
int char_len; /* character length */
int opt_count; /* referenced count in optimize_node_left() */
} EncloseNode;
#ifdef USE_SUBEXP_CALL
typedef struct {
int offset;
struct _Node* target;
} UnsetAddr;
typedef struct {
int num;
int alloc;
UnsetAddr* us;
} UnsetAddrList;
typedef struct {
NodeBase base;
int state;
int group_num;
UChar* name;
UChar* name_end;
struct _Node* target; /* EncloseNode : ENCLOSE_MEMORY */
UnsetAddrList* unset_addr_list;
} CallNode;
#endif
typedef struct {
NodeBase base;
int state;
int back_num;
int back_static[NODE_BACKREFS_SIZE];
int* back_dynamic;
int nest_level;
} BRefNode;
typedef struct {
NodeBase base;
int type;
struct _Node* target;
int char_len;
} AnchorNode;
typedef struct {
NodeBase base;
struct _Node* car;
struct _Node* cdr;
} ConsAltNode;
typedef struct {
NodeBase base;
int ctype;
int not;
} CtypeNode;
typedef struct _Node {
union {
NodeBase base;
StrNode str;
CClassNode cclass;
QtfrNode qtfr;
EncloseNode enclose;
BRefNode bref;
AnchorNode anchor;
ConsAltNode cons;
CtypeNode ctype;
#ifdef USE_SUBEXP_CALL
CallNode call;
#endif
} u;
} Node;
#define NULL_NODE ((Node* )0)
#define SCANENV_MEMNODES_SIZE 8
#define SCANENV_MEM_NODES(senv) \
(IS_NOT_NULL((senv)->mem_nodes_dynamic) ? \
(senv)->mem_nodes_dynamic : (senv)->mem_nodes_static)
typedef struct {
OnigOptionType option;
OnigCaseFoldType case_fold_flag;
OnigEncoding enc;
OnigSyntaxType* syntax;
BitStatusType capture_history;
BitStatusType bt_mem_start;
BitStatusType bt_mem_end;
BitStatusType backrefed_mem;
UChar* pattern;
UChar* pattern_end;
UChar* error;
UChar* error_end;
regex_t* reg; /* for reg->names only */
int num_call;
#ifdef USE_SUBEXP_CALL
UnsetAddrList* unset_addr_list;
#endif
int num_mem;
#ifdef USE_NAMED_GROUP
int num_named;
#endif
int mem_alloc;
Node* mem_nodes_static[SCANENV_MEMNODES_SIZE];
Node** mem_nodes_dynamic;
#ifdef USE_COMBINATION_EXPLOSION_CHECK
int num_comb_exp_check;
int comb_exp_max_regnum;
int curr_max_regnum;
int has_recursion;
#endif
} ScanEnv;
#define IS_SYNTAX_OP(syn, opm) (((syn)->op & (opm)) != 0)
#define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0)
#define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0)
#ifdef USE_NAMED_GROUP
typedef struct {
int new_val;
} GroupNumRemap;
extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map));
#endif
extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n));
extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end));
extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end));
extern int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc));
extern void onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode));
extern void onig_node_conv_to_str_node P_((Node* node, int raw));
extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end));
extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end));
extern void onig_node_free P_((Node* node));
extern Node* onig_node_new_enclose P_((int type));
extern Node* onig_node_new_anchor P_((int type));
extern Node* onig_node_new_str P_((const UChar* s, const UChar* end));
extern Node* onig_node_new_list P_((Node* left, Node* right));
extern Node* onig_node_list_add P_((Node* list, Node* x));
extern Node* onig_node_new_alt P_((Node* left, Node* right));
extern void onig_node_str_clear P_((Node* node));
extern int onig_free_node_list P_((void));
extern int onig_names_free P_((regex_t* reg));
extern int onig_parse_make_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env));
extern int onig_free_shared_cclass_table P_((void));
#ifdef ONIG_DEBUG
#ifdef USE_NAMED_GROUP
extern int onig_print_names(FILE*, regex_t*);
#endif
#endif
#endif /* REGPARSE_H */

View File

@@ -1,98 +0,0 @@
/**********************************************************************
regposerr.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "onig_config.h"
#include "onigposix.h"
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#if defined(__GNUC__)
# define ARG_UNUSED __attribute__ ((unused))
#else
# define ARG_UNUSED
#endif
static char* ESTRING[] = {
NULL,
"failed to match", /* REG_NOMATCH */
"Invalid regular expression", /* REG_BADPAT */
"invalid collating element referenced", /* REG_ECOLLATE */
"invalid character class type referenced", /* REG_ECTYPE */
"bad backslash-escape sequence", /* REG_EESCAPE */
"invalid back reference number", /* REG_ESUBREG */
"imbalanced [ and ]", /* REG_EBRACK */
"imbalanced ( and )", /* REG_EPAREN */
"imbalanced { and }", /* REG_EBRACE */
"invalid repeat range {n,m}", /* REG_BADBR */
"invalid range", /* REG_ERANGE */
"Out of memory", /* REG_ESPACE */
"? * + not preceded by valid regular expression", /* REG_BADRPT */
/* Extended errors */
"internal error", /* REG_EONIG_INTERNAL */
"invalid wide char value", /* REG_EONIG_BADWC */
"invalid argument", /* REG_EONIG_BADARG */
"multi-thread error" /* REG_EONIG_THREAD */
};
#include <stdio.h>
extern size_t
regerror(int posix_ecode, const regex_t* reg ARG_UNUSED, char* buf,
size_t size)
{
char* s;
char tbuf[35];
size_t len;
if (posix_ecode > 0
&& posix_ecode < (int )(sizeof(ESTRING) / sizeof(ESTRING[0]))) {
s = ESTRING[posix_ecode];
}
else if (posix_ecode == 0) {
s = "";
}
else {
sprintf(tbuf, "undefined error code (%d)", posix_ecode);
s = tbuf;
}
len = strlen(s) + 1; /* use strlen() because s is ascii encoding. */
if (buf != NULL && size > 0) {
strncpy(buf, s, size - 1);
buf[size - 1] = '\0';
}
return len;
}

View File

@@ -1,303 +0,0 @@
/**********************************************************************
regposix.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#define regex_t onig_regex_t
#include "regint.h"
#undef regex_t
#include "onigposix.h"
#define ONIG_C(reg) ((onig_regex_t* )((reg)->onig))
#define PONIG_C(reg) ((onig_regex_t** )(&(reg)->onig))
/* #define ENC_STRING_LEN(enc,s,len) len = strlen(s) */
#define ENC_STRING_LEN(enc,s,len) do { \
if (ONIGENC_MBC_MINLEN(enc) == 1) { \
UChar* tmps = (UChar* )(s); \
while (*tmps != 0) tmps++; \
len = tmps - (UChar* )(s); \
} \
else { \
len = onigenc_str_bytelen_null(enc, (UChar* )s); \
} \
} while(0)
typedef struct {
int onig_err;
int posix_err;
} O2PERR;
static int
onig2posix_error_code(int code)
{
static const O2PERR o2p[] = {
{ ONIG_MISMATCH, REG_NOMATCH },
{ ONIG_NO_SUPPORT_CONFIG, REG_EONIG_INTERNAL },
{ ONIGERR_MEMORY, REG_ESPACE },
{ ONIGERR_MATCH_STACK_LIMIT_OVER, REG_EONIG_INTERNAL },
{ ONIGERR_TYPE_BUG, REG_EONIG_INTERNAL },
{ ONIGERR_PARSER_BUG, REG_EONIG_INTERNAL },
{ ONIGERR_STACK_BUG, REG_EONIG_INTERNAL },
{ ONIGERR_UNDEFINED_BYTECODE, REG_EONIG_INTERNAL },
{ ONIGERR_UNEXPECTED_BYTECODE, REG_EONIG_INTERNAL },
{ ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED, REG_EONIG_BADARG },
{ ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR, REG_EONIG_BADARG },
{ ONIGERR_INVALID_ARGUMENT, REG_EONIG_BADARG },
{ ONIGERR_END_PATTERN_AT_LEFT_BRACE, REG_EBRACE },
{ ONIGERR_END_PATTERN_AT_LEFT_BRACKET, REG_EBRACK },
{ ONIGERR_EMPTY_CHAR_CLASS, REG_ECTYPE },
{ ONIGERR_PREMATURE_END_OF_CHAR_CLASS, REG_ECTYPE },
{ ONIGERR_END_PATTERN_AT_ESCAPE, REG_EESCAPE },
{ ONIGERR_END_PATTERN_AT_META, REG_EESCAPE },
{ ONIGERR_END_PATTERN_AT_CONTROL, REG_EESCAPE },
{ ONIGERR_META_CODE_SYNTAX, REG_BADPAT },
{ ONIGERR_CONTROL_CODE_SYNTAX, REG_BADPAT },
{ ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE, REG_ECTYPE },
{ ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE, REG_ECTYPE },
{ ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS, REG_ECTYPE },
{ ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED, REG_BADRPT },
{ ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID, REG_BADRPT },
{ ONIGERR_NESTED_REPEAT_OPERATOR, REG_BADRPT },
{ ONIGERR_UNMATCHED_CLOSE_PARENTHESIS, REG_EPAREN },
{ ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS, REG_EPAREN },
{ ONIGERR_END_PATTERN_IN_GROUP, REG_BADPAT },
{ ONIGERR_UNDEFINED_GROUP_OPTION, REG_BADPAT },
{ ONIGERR_INVALID_POSIX_BRACKET_TYPE, REG_BADPAT },
{ ONIGERR_INVALID_LOOK_BEHIND_PATTERN, REG_BADPAT },
{ ONIGERR_INVALID_REPEAT_RANGE_PATTERN, REG_BADPAT },
{ ONIGERR_TOO_BIG_NUMBER, REG_BADPAT },
{ ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE, REG_BADBR },
{ ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE, REG_BADBR },
{ ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS, REG_ECTYPE },
{ ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE, REG_ECTYPE },
{ ONIGERR_TOO_MANY_MULTI_BYTE_RANGES, REG_ECTYPE },
{ ONIGERR_TOO_SHORT_MULTI_BYTE_STRING, REG_BADPAT },
{ ONIGERR_TOO_BIG_BACKREF_NUMBER, REG_ESUBREG },
{ ONIGERR_INVALID_BACKREF, REG_ESUBREG },
{ ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT },
{ ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
{ ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
{ ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC },
{ ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT },
{ ONIGERR_INVALID_GROUP_NAME, REG_BADPAT },
{ ONIGERR_INVALID_CHAR_IN_GROUP_NAME, REG_BADPAT },
{ ONIGERR_UNDEFINED_NAME_REFERENCE, REG_BADPAT },
{ ONIGERR_UNDEFINED_GROUP_REFERENCE, REG_BADPAT },
{ ONIGERR_MULTIPLEX_DEFINED_NAME, REG_BADPAT },
{ ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, REG_BADPAT },
{ ONIGERR_NEVER_ENDING_RECURSION, REG_BADPAT },
{ ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY, REG_BADPAT },
{ ONIGERR_INVALID_CHAR_PROPERTY_NAME, REG_BADPAT },
{ ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION, REG_EONIG_BADARG },
{ ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT, REG_EONIG_THREAD }
};
int i;
if (code >= 0) return 0;
for (i = 0; i < (int )(sizeof(o2p) / sizeof(o2p[0])); i++) {
if (code == o2p[i].onig_err)
return o2p[i].posix_err;
}
return REG_EONIG_INTERNAL; /* but, unknown error code */
}
extern int
regcomp(regex_t* reg, const char* pattern, int posix_options)
{
int r, len;
OnigSyntaxType* syntax = OnigDefaultSyntax;
OnigOptionType options;
if ((posix_options & REG_EXTENDED) == 0)
syntax = ONIG_SYNTAX_POSIX_BASIC;
options = syntax->options;
if ((posix_options & REG_ICASE) != 0)
ONIG_OPTION_ON(options, ONIG_OPTION_IGNORECASE);
if ((posix_options & REG_NEWLINE) != 0) {
ONIG_OPTION_ON( options, ONIG_OPTION_NEGATE_SINGLELINE);
ONIG_OPTION_OFF(options, ONIG_OPTION_SINGLELINE);
}
reg->comp_options = posix_options;
ENC_STRING_LEN(OnigEncDefaultCharEncoding, pattern, len);
r = onig_new(PONIG_C(reg), (UChar* )pattern, (UChar* )(pattern + len),
options, OnigEncDefaultCharEncoding, syntax,
(OnigErrorInfo* )NULL);
if (r != ONIG_NORMAL) {
return onig2posix_error_code(r);
}
reg->re_nsub = ONIG_C(reg)->num_mem;
return 0;
}
extern int
regexec(regex_t* reg, const char* str, size_t nmatch,
regmatch_t pmatch[], int posix_options)
{
int r, i, len;
UChar* end;
regmatch_t* pm;
OnigOptionType options;
options = ONIG_OPTION_POSIX_REGION;
if ((posix_options & REG_NOTBOL) != 0) options |= ONIG_OPTION_NOTBOL;
if ((posix_options & REG_NOTEOL) != 0) options |= ONIG_OPTION_NOTEOL;
if (nmatch == 0 || (reg->comp_options & REG_NOSUB) != 0) {
pm = (regmatch_t* )NULL;
nmatch = 0;
}
else if ((int )nmatch < ONIG_C(reg)->num_mem + 1) {
pm = (regmatch_t* )xmalloc(sizeof(regmatch_t)
* (ONIG_C(reg)->num_mem + 1));
if (pm == NULL)
return REG_ESPACE;
}
else {
pm = pmatch;
}
ENC_STRING_LEN(ONIG_C(reg)->enc, str, len);
end = (UChar* )(str + len);
r = onig_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end,
(OnigRegion* )pm, options);
if (r >= 0) {
r = 0; /* Match */
if (pm != pmatch && pm != NULL) {
xmemcpy(pmatch, pm, sizeof(regmatch_t) * nmatch);
}
}
else if (r == ONIG_MISMATCH) {
r = REG_NOMATCH;
for (i = 0; i < (int )nmatch; i++)
pmatch[i].rm_so = pmatch[i].rm_eo = ONIG_REGION_NOTPOS;
}
else {
r = onig2posix_error_code(r);
}
if (pm != pmatch && pm != NULL)
xfree(pm);
#if 0
if (reg->re_nsub > nmatch - 1)
reg->re_nsub = (nmatch <= 1 ? 0 : nmatch - 1);
#endif
return r;
}
extern void
regfree(regex_t* reg)
{
onig_free(ONIG_C(reg));
}
extern void
reg_set_encoding(int mb_code)
{
OnigEncoding enc;
switch (mb_code) {
case REG_POSIX_ENCODING_ASCII:
enc = ONIG_ENCODING_ASCII;
break;
case REG_POSIX_ENCODING_EUC_JP:
enc = ONIG_ENCODING_EUC_JP;
break;
case REG_POSIX_ENCODING_SJIS:
enc = ONIG_ENCODING_SJIS;
break;
case REG_POSIX_ENCODING_UTF8:
enc = ONIG_ENCODING_UTF8;
break;
case REG_POSIX_ENCODING_UTF16_BE:
enc = ONIG_ENCODING_UTF16_BE;
break;
case REG_POSIX_ENCODING_UTF16_LE:
enc = ONIG_ENCODING_UTF16_LE;
break;
default:
return ;
break;
}
onigenc_set_default_encoding(enc);
}
extern int
reg_name_to_group_numbers(regex_t* reg,
const unsigned char* name, const unsigned char* name_end, int** nums)
{
return onig_name_to_group_numbers(ONIG_C(reg), name, name_end, nums);
}
typedef struct {
int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*);
regex_t* reg;
void* arg;
} i_wrap;
static int
i_wrapper(const UChar* name, const UChar* name_end, int ng, int* gs,
onig_regex_t* reg ARG_UNUSED, void* arg)
{
i_wrap* warg = (i_wrap* )arg;
return (*warg->func)(name, name_end, ng, gs, warg->reg, warg->arg);
}
extern int
reg_foreach_name(regex_t* reg,
int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*),
void* arg)
{
i_wrap warg;
warg.func = func;
warg.reg = reg;
warg.arg = arg;
return onig_foreach_name(ONIG_C(reg), i_wrapper, &warg);
}
extern int
reg_number_of_names(regex_t* reg)
{
return onig_number_of_names(ONIG_C(reg));
}

View File

@@ -1,315 +0,0 @@
/**********************************************************************
regsyntax.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regint.h"
OnigSyntaxType OnigSyntaxASIS = {
0
, ONIG_SYN_OP2_INEFFECTIVE_ESCAPE
, 0
, ONIG_OPTION_NONE
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
OnigSyntaxType OnigSyntaxPosixBasic = {
( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP |
ONIG_SYN_OP_ESC_BRACE_INTERVAL )
, 0
, 0
, ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE )
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
OnigSyntaxType OnigSyntaxPosixExtended = {
( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_LPAREN_SUBEXP |
ONIG_SYN_OP_BRACE_INTERVAL |
ONIG_SYN_OP_PLUS_ONE_INF | ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_VBAR_ALT )
, 0
, ( ONIG_SYN_CONTEXT_INDEP_ANCHORS |
ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS |
ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP |
ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC )
, ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE )
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
OnigSyntaxType OnigSyntaxEmacs = {
( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC |
ONIG_SYN_OP_ESC_BRACE_INTERVAL |
ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_VBAR_ALT |
ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF |
ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_DECIMAL_BACKREF |
ONIG_SYN_OP_LINE_ANCHOR | ONIG_SYN_OP_ESC_CONTROL_CHARS )
, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR
, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC
, ONIG_OPTION_NONE
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
OnigSyntaxType OnigSyntaxGrep = {
( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_POSIX_BRACKET |
ONIG_SYN_OP_ESC_BRACE_INTERVAL | ONIG_SYN_OP_ESC_LPAREN_SUBEXP |
ONIG_SYN_OP_ESC_VBAR_ALT |
ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_ESC_PLUS_ONE_INF |
ONIG_SYN_OP_ESC_QMARK_ZERO_ONE | ONIG_SYN_OP_LINE_ANCHOR |
ONIG_SYN_OP_ESC_W_WORD | ONIG_SYN_OP_ESC_B_WORD_BOUND |
ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | ONIG_SYN_OP_DECIMAL_BACKREF )
, 0
, ( ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC | ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC )
, ONIG_OPTION_NONE
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
OnigSyntaxType OnigSyntaxGnuRegex = {
SYN_GNU_REGEX_OP
, 0
, SYN_GNU_REGEX_BV
, ONIG_OPTION_NONE
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
OnigSyntaxType OnigSyntaxJava = {
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
ONIG_SYN_OP_ESC_CONTROL_CHARS | ONIG_SYN_OP_ESC_C_CONTROL |
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 )
& ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
, ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
ONIG_SYN_OP2_OPTION_PERL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL | ONIG_SYN_OP2_CCLASS_SET_OP |
ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 |
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY )
, ( SYN_GNU_REGEX_BV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND )
, ONIG_OPTION_SINGLELINE
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
OnigSyntaxType OnigSyntaxPerl = {
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
ONIG_SYN_OP_ESC_C_CONTROL )
& ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
, ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE |
ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL |
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT )
, SYN_GNU_REGEX_BV
, ONIG_OPTION_SINGLELINE
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
/* Perl + named group */
OnigSyntaxType OnigSyntaxPerl_NG = {
(( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
ONIG_SYN_OP_ESC_C_CONTROL )
& ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
, ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE |
ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL |
ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP |
ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
ONIG_SYN_OP2_ESC_G_SUBEXP_CALL )
, ( SYN_GNU_REGEX_BV |
ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME )
, ONIG_OPTION_SINGLELINE
,
{
(OnigCodePoint )'\\' /* esc */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
, (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
}
};
extern int
onig_set_default_syntax(OnigSyntaxType* syntax)
{
if (IS_NULL(syntax))
syntax = ONIG_SYNTAX_RUBY;
OnigDefaultSyntax = syntax;
return 0;
}
extern void
onig_copy_syntax(OnigSyntaxType* to, OnigSyntaxType* from)
{
*to = *from;
}
extern void
onig_set_syntax_op(OnigSyntaxType* syntax, unsigned int op)
{
syntax->op = op;
}
extern void
onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2)
{
syntax->op2 = op2;
}
extern void
onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior)
{
syntax->behavior = behavior;
}
extern void
onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options)
{
syntax->options = options;
}
extern unsigned int
onig_get_syntax_op(OnigSyntaxType* syntax)
{
return syntax->op;
}
extern unsigned int
onig_get_syntax_op2(OnigSyntaxType* syntax)
{
return syntax->op2;
}
extern unsigned int
onig_get_syntax_behavior(OnigSyntaxType* syntax)
{
return syntax->behavior;
}
extern OnigOptionType
onig_get_syntax_options(OnigSyntaxType* syntax)
{
return syntax->options;
}
#ifdef USE_VARIABLE_META_CHARS
extern int onig_set_meta_char(OnigSyntaxType* enc,
unsigned int what, OnigCodePoint code)
{
switch (what) {
case ONIG_META_CHAR_ESCAPE:
enc->meta_char_table.esc = code;
break;
case ONIG_META_CHAR_ANYCHAR:
enc->meta_char_table.anychar = code;
break;
case ONIG_META_CHAR_ANYTIME:
enc->meta_char_table.anytime = code;
break;
case ONIG_META_CHAR_ZERO_OR_ONE_TIME:
enc->meta_char_table.zero_or_one_time = code;
break;
case ONIG_META_CHAR_ONE_OR_MORE_TIME:
enc->meta_char_table.one_or_more_time = code;
break;
case ONIG_META_CHAR_ANYCHAR_ANYTIME:
enc->meta_char_table.anychar_anytime = code;
break;
default:
return ONIGERR_INVALID_ARGUMENT;
break;
}
return 0;
}
#endif /* USE_VARIABLE_META_CHARS */

View File

@@ -1,76 +0,0 @@
/**********************************************************************
regtrav.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2004 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regint.h"
#ifdef USE_CAPTURE_HISTORY
static int
capture_tree_traverse(OnigCaptureTreeNode* node, int at,
int(*callback_func)(int,int,int,int,int,void*),
int level, void* arg)
{
int r, i;
if (node == (OnigCaptureTreeNode* )0)
return 0;
if ((at & ONIG_TRAVERSE_CALLBACK_AT_FIRST) != 0) {
r = (*callback_func)(node->group, node->beg, node->end,
level, ONIG_TRAVERSE_CALLBACK_AT_FIRST, arg);
if (r != 0) return r;
}
for (i = 0; i < node->num_childs; i++) {
r = capture_tree_traverse(node->childs[i], at,
callback_func, level + 1, arg);
if (r != 0) return r;
}
if ((at & ONIG_TRAVERSE_CALLBACK_AT_LAST) != 0) {
r = (*callback_func)(node->group, node->beg, node->end,
level, ONIG_TRAVERSE_CALLBACK_AT_LAST, arg);
if (r != 0) return r;
}
return 0;
}
#endif /* USE_CAPTURE_HISTORY */
extern int
onig_capture_tree_traverse(OnigRegion* region, int at,
int(*callback_func)(int,int,int,int,int,void*), void* arg)
{
#ifdef USE_CAPTURE_HISTORY
return capture_tree_traverse(region->history_root, at,
callback_func, 0, arg);
#else
return ONIG_NO_SUPPORT_CONFIG;
#endif
}

View File

@@ -1,56 +0,0 @@
/**********************************************************************
regversion.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "onig_config.h"
#include "oniguruma.h"
#include <stdio.h>
extern const char*
onig_version(void)
{
static char s[12];
sprintf(s, "%d.%d.%d",
ONIGURUMA_VERSION_MAJOR,
ONIGURUMA_VERSION_MINOR,
ONIGURUMA_VERSION_TEENY);
return s;
}
extern const char*
onig_copyright(void)
{
static char s[58];
sprintf(s, "Oniguruma %d.%d.%d : Copyright (C) 2002-2008 K.Kosako",
ONIGURUMA_VERSION_MAJOR,
ONIGURUMA_VERSION_MINOR,
ONIGURUMA_VERSION_TEENY);
return s;
}

View File

@@ -1,578 +0,0 @@
/* This is a public domain general purpose hash table package written by Peter Moore @ UCB. */
/* static char sccsid[] = "@(#) st.c 5.1 89/12/14 Crucible"; */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <malloc.h>
#endif
#include "regint.h"
#include "st.h"
typedef struct st_table_entry st_table_entry;
struct st_table_entry {
unsigned int hash;
st_data_t key;
st_data_t record;
st_table_entry *next;
};
#define ST_DEFAULT_MAX_DENSITY 5
#define ST_DEFAULT_INIT_TABLE_SIZE 11
/*
* DEFAULT_MAX_DENSITY is the default for the largest we allow the
* average number of items per bin before increasing the number of
* bins
*
* DEFAULT_INIT_TABLE_SIZE is the default for the number of bins
* allocated initially
*
*/
static int numcmp(long, long);
static int numhash(long);
static struct st_hash_type type_numhash = {
numcmp,
numhash,
};
/* extern int strcmp(const char *, const char *); */
static int strhash(const char *);
static struct st_hash_type type_strhash = {
strcmp,
strhash,
};
static void rehash(st_table *);
#define alloc(type) (type*)xmalloc((unsigned)sizeof(type))
#define Calloc(n,s) (char*)xcalloc((n),(s))
#define EQUAL(table,x,y) ((x)==(y) || (*table->type->compare)((x),(y)) == 0)
#define do_hash(key,table) (unsigned int)(*(table)->type->hash)((key))
#define do_hash_bin(key,table) (do_hash(key, table)%(table)->num_bins)
/*
* MINSIZE is the minimum size of a dictionary.
*/
#define MINSIZE 8
/*
Table of prime numbers 2^n+a, 2<=n<=30.
*/
static const long primes[] = {
8 + 3,
16 + 3,
32 + 5,
64 + 3,
128 + 3,
256 + 27,
512 + 9,
1024 + 9,
2048 + 5,
4096 + 3,
8192 + 27,
16384 + 43,
32768 + 3,
65536 + 45,
131072 + 29,
262144 + 3,
524288 + 21,
1048576 + 7,
2097152 + 17,
4194304 + 15,
8388608 + 9,
16777216 + 43,
33554432 + 35,
67108864 + 15,
134217728 + 29,
268435456 + 3,
536870912 + 11,
1073741824 + 85,
0
};
static int
new_size(size)
int size;
{
int i;
#if 0
for (i=3; i<31; i++) {
if ((1<<i) > size) return 1<<i;
}
return -1;
#else
int newsize;
for (i = 0, newsize = MINSIZE;
i < (int )(sizeof(primes)/sizeof(primes[0]));
i++, newsize <<= 1)
{
if (newsize > size) return primes[i];
}
/* Ran out of polynomials */
return -1; /* should raise exception */
#endif
}
#ifdef HASH_LOG
static int collision = 0;
static int init_st = 0;
static void
stat_col()
{
FILE *f = fopen("/tmp/col", "w");
fprintf(f, "collision: %d\n", collision);
fclose(f);
}
#endif
st_table*
st_init_table_with_size(type, size)
struct st_hash_type *type;
int size;
{
st_table *tbl;
#ifdef HASH_LOG
if (init_st == 0) {
init_st = 1;
atexit(stat_col);
}
#endif
size = new_size(size); /* round up to prime number */
tbl = alloc(st_table);
tbl->type = type;
tbl->num_entries = 0;
tbl->num_bins = size;
tbl->bins = (st_table_entry **)Calloc(size, sizeof(st_table_entry*));
return tbl;
}
st_table*
st_init_table(type)
struct st_hash_type *type;
{
return st_init_table_with_size(type, 0);
}
st_table*
st_init_numtable(void)
{
return st_init_table(&type_numhash);
}
st_table*
st_init_numtable_with_size(size)
int size;
{
return st_init_table_with_size(&type_numhash, size);
}
st_table*
st_init_strtable(void)
{
return st_init_table(&type_strhash);
}
st_table*
st_init_strtable_with_size(size)
int size;
{
return st_init_table_with_size(&type_strhash, size);
}
void
st_free_table(table)
st_table *table;
{
register st_table_entry *ptr, *next;
int i;
for(i = 0; i < table->num_bins; i++) {
ptr = table->bins[i];
while (ptr != 0) {
next = ptr->next;
free(ptr);
ptr = next;
}
}
free(table->bins);
free(table);
}
#define PTR_NOT_EQUAL(table, ptr, hash_val, key) \
((ptr) != 0 && (ptr->hash != (hash_val) || !EQUAL((table), (key), (ptr)->key)))
#ifdef HASH_LOG
#define COLLISION collision++
#else
#define COLLISION
#endif
#define FIND_ENTRY(table, ptr, hash_val, bin_pos) do {\
bin_pos = hash_val%(table)->num_bins;\
ptr = (table)->bins[bin_pos];\
if (PTR_NOT_EQUAL(table, ptr, hash_val, key)) {\
COLLISION;\
while (PTR_NOT_EQUAL(table, ptr->next, hash_val, key)) {\
ptr = ptr->next;\
}\
ptr = ptr->next;\
}\
} while (0)
int
st_lookup(table, key, value)
st_table *table;
register st_data_t key;
st_data_t *value;
{
unsigned int hash_val, bin_pos;
register st_table_entry *ptr;
hash_val = do_hash(key, table);
FIND_ENTRY(table, ptr, hash_val, bin_pos);
if (ptr == 0) {
return 0;
}
else {
if (value != 0) *value = ptr->record;
return 1;
}
}
#define ADD_DIRECT(table, key, value, hash_val, bin_pos)\
do {\
st_table_entry *entry;\
if (table->num_entries/(table->num_bins) > ST_DEFAULT_MAX_DENSITY) {\
rehash(table);\
bin_pos = hash_val % table->num_bins;\
}\
\
entry = alloc(st_table_entry);\
\
entry->hash = hash_val;\
entry->key = key;\
entry->record = value;\
entry->next = table->bins[bin_pos];\
table->bins[bin_pos] = entry;\
table->num_entries++;\
} while (0)
int
st_insert(table, key, value)
register st_table *table;
register st_data_t key;
st_data_t value;
{
unsigned int hash_val, bin_pos;
register st_table_entry *ptr;
hash_val = do_hash(key, table);
FIND_ENTRY(table, ptr, hash_val, bin_pos);
if (ptr == 0) {
ADD_DIRECT(table, key, value, hash_val, bin_pos);
return 0;
}
else {
ptr->record = value;
return 1;
}
}
void
st_add_direct(table, key, value)
st_table *table;
st_data_t key;
st_data_t value;
{
unsigned int hash_val, bin_pos;
hash_val = do_hash(key, table);
bin_pos = hash_val % table->num_bins;
ADD_DIRECT(table, key, value, hash_val, bin_pos);
}
static void
rehash(table)
register st_table *table;
{
register st_table_entry *ptr, *next, **new_bins;
int i, old_num_bins = table->num_bins, new_num_bins;
unsigned int hash_val;
new_num_bins = new_size(old_num_bins+1);
new_bins = (st_table_entry**)Calloc(new_num_bins, sizeof(st_table_entry*));
for(i = 0; i < old_num_bins; i++) {
ptr = table->bins[i];
while (ptr != 0) {
next = ptr->next;
hash_val = ptr->hash % new_num_bins;
ptr->next = new_bins[hash_val];
new_bins[hash_val] = ptr;
ptr = next;
}
}
free(table->bins);
table->num_bins = new_num_bins;
table->bins = new_bins;
}
st_table*
st_copy(old_table)
st_table *old_table;
{
st_table *new_table;
st_table_entry *ptr, *entry;
int i, num_bins = old_table->num_bins;
new_table = alloc(st_table);
if (new_table == 0) {
return 0;
}
*new_table = *old_table;
new_table->bins = (st_table_entry**)
Calloc((unsigned)num_bins, sizeof(st_table_entry*));
if (new_table->bins == 0) {
free(new_table);
return 0;
}
for(i = 0; i < num_bins; i++) {
new_table->bins[i] = 0;
ptr = old_table->bins[i];
while (ptr != 0) {
entry = alloc(st_table_entry);
if (entry == 0) {
free(new_table->bins);
free(new_table);
return 0;
}
*entry = *ptr;
entry->next = new_table->bins[i];
new_table->bins[i] = entry;
ptr = ptr->next;
}
}
return new_table;
}
int
st_delete(table, key, value)
register st_table *table;
register st_data_t *key;
st_data_t *value;
{
unsigned int hash_val;
st_table_entry *tmp;
register st_table_entry *ptr;
hash_val = do_hash_bin(*key, table);
ptr = table->bins[hash_val];
if (ptr == 0) {
if (value != 0) *value = 0;
return 0;
}
if (EQUAL(table, *key, ptr->key)) {
table->bins[hash_val] = ptr->next;
table->num_entries--;
if (value != 0) *value = ptr->record;
*key = ptr->key;
free(ptr);
return 1;
}
for(; ptr->next != 0; ptr = ptr->next) {
if (EQUAL(table, ptr->next->key, *key)) {
tmp = ptr->next;
ptr->next = ptr->next->next;
table->num_entries--;
if (value != 0) *value = tmp->record;
*key = tmp->key;
free(tmp);
return 1;
}
}
return 0;
}
int
st_delete_safe(table, key, value, never)
register st_table *table;
register st_data_t *key;
st_data_t *value;
st_data_t never;
{
unsigned int hash_val;
register st_table_entry *ptr;
hash_val = do_hash_bin(*key, table);
ptr = table->bins[hash_val];
if (ptr == 0) {
if (value != 0) *value = 0;
return 0;
}
for(; ptr != 0; ptr = ptr->next) {
if ((ptr->key != never) && EQUAL(table, ptr->key, *key)) {
table->num_entries--;
*key = ptr->key;
if (value != 0) *value = ptr->record;
ptr->key = ptr->record = never;
return 1;
}
}
return 0;
}
static int
#if defined(__GNUC__)
delete_never(st_data_t key __attribute__ ((unused)), st_data_t value,
st_data_t never)
#else
delete_never(key, value, never)
st_data_t key, value, never;
#endif
{
if (value == never) return ST_DELETE;
return ST_CONTINUE;
}
void
st_cleanup_safe(table, never)
st_table *table;
st_data_t never;
{
int num_entries = table->num_entries;
st_foreach(table, delete_never, never);
table->num_entries = num_entries;
}
int
st_foreach(table, func, arg)
st_table *table;
int (*func)();
st_data_t arg;
{
st_table_entry *ptr, *last, *tmp;
enum st_retval retval;
int i;
for(i = 0; i < table->num_bins; i++) {
last = 0;
for(ptr = table->bins[i]; ptr != 0;) {
retval = (*func)(ptr->key, ptr->record, arg);
switch (retval) {
case ST_CHECK: /* check if hash is modified during iteration */
tmp = 0;
if (i < table->num_bins) {
for (tmp = table->bins[i]; tmp; tmp=tmp->next) {
if (tmp == ptr) break;
}
}
if (!tmp) {
/* call func with error notice */
return 1;
}
/* fall through */
case ST_CONTINUE:
last = ptr;
ptr = ptr->next;
break;
case ST_STOP:
return 0;
case ST_DELETE:
tmp = ptr;
if (last == 0) {
table->bins[i] = ptr->next;
}
else {
last->next = ptr->next;
}
ptr = ptr->next;
free(tmp);
table->num_entries--;
}
}
}
return 0;
}
static int
strhash(string)
register const char *string;
{
register int c;
#ifdef HASH_ELFHASH
register unsigned int h = 0, g;
while ((c = *string++) != '\0') {
h = ( h << 4 ) + c;
if ( g = h & 0xF0000000 )
h ^= g >> 24;
h &= ~g;
}
return h;
#elif HASH_PERL
register int val = 0;
while ((c = *string++) != '\0') {
val += c;
val += (val << 10);
val ^= (val >> 6);
}
val += (val << 3);
val ^= (val >> 11);
return val + (val << 15);
#else
register int val = 0;
while ((c = *string++) != '\0') {
val = val*997 + c;
}
return val + (val>>5);
#endif
}
static int
numcmp(x, y)
long x, y;
{
return x != y;
}
static int
numhash(n)
long n;
{
return n;
}

View File

@@ -1,68 +0,0 @@
/* This is a public domain general purpose hash table package written by Peter Moore @ UCB. */
/* @(#) st.h 5.1 89/12/14 */
#ifndef ST_INCLUDED
#define ST_INCLUDED
#ifdef _WIN32
# include <windows.h>
typedef ULONG_PTR st_data_t;
#else
typedef unsigned long st_data_t;
#endif
#define ST_DATA_T_DEFINED
typedef struct st_table st_table;
struct st_hash_type {
int (*compare)();
int (*hash)();
};
struct st_table {
struct st_hash_type *type;
int num_bins;
int num_entries;
struct st_table_entry **bins;
};
#define st_is_member(table,key) st_lookup(table,key,(st_data_t *)0)
enum st_retval {ST_CONTINUE, ST_STOP, ST_DELETE, ST_CHECK};
#ifndef _
# define _(args) args
#endif
#ifndef ANYARGS
# ifdef __cplusplus
# define ANYARGS ...
# else
# define ANYARGS
# endif
#endif
st_table *st_init_table _((struct st_hash_type *));
st_table *st_init_table_with_size _((struct st_hash_type *, int));
st_table *st_init_numtable _((void));
st_table *st_init_numtable_with_size _((int));
st_table *st_init_strtable _((void));
st_table *st_init_strtable_with_size _((int));
int st_delete _((st_table *, st_data_t *, st_data_t *));
int st_delete_safe _((st_table *, st_data_t *, st_data_t *, st_data_t));
int st_insert _((st_table *, st_data_t, st_data_t));
int st_lookup _((st_table *, st_data_t, st_data_t *));
int st_foreach _((st_table *, int (*)(ANYARGS), st_data_t));
void st_add_direct _((st_table *, st_data_t, st_data_t));
void st_free_table _((st_table *));
void st_cleanup_safe _((st_table *, st_data_t));
st_table *st_copy _((st_table *));
#define ST_NUMCMP ((int (*)()) 0)
#define ST_NUMHASH ((int (*)()) -2)
#define st_numcmp ST_NUMCMP
#define st_numhash ST_NUMHASH
#endif /* ST_INCLUDED */

File diff suppressed because it is too large Load Diff

View File

@@ -1,305 +0,0 @@
/**********************************************************************
utf8.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regenc.h"
#define USE_INVALID_CODE_SCHEME
#ifdef USE_INVALID_CODE_SCHEME
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
#define INVALID_CODE_FE 0xfffffffe
#define INVALID_CODE_FF 0xffffffff
#define VALID_CODE_LIMIT 0x7fffffff
#endif
#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80)
static const int EncLen_UTF8[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};
static int
mbc_enc_len(const UChar* p)
{
return EncLen_UTF8[*p];
}
static int
is_mbc_newline(const UChar* p, const UChar* end)
{
if (p < end) {
if (*p == 0x0a) return 1;
#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
#ifndef USE_CRNL_AS_LINE_TERMINATOR
if (*p == 0x0d) return 1;
#endif
if (p + 1 < end) {
if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
return 1;
if (p + 2 < end) {
if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
&& *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
return 1;
}
}
#endif
}
return 0;
}
static OnigCodePoint
mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
{
int c, len;
OnigCodePoint n;
len = enclen(ONIG_ENCODING_UTF8, p);
c = *p++;
if (len > 1) {
len--;
n = c & ((1 << (6 - len)) - 1);
while (len--) {
c = *p++;
n = (n << 6) | (c & ((1 << 6) - 1));
}
return n;
}
else {
#ifdef USE_INVALID_CODE_SCHEME
if (c > 0xfd) {
return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
}
#endif
return (OnigCodePoint )c;
}
}
static int
code_to_mbclen(OnigCodePoint code)
{
if ((code & 0xffffff80) == 0) return 1;
else if ((code & 0xfffff800) == 0) return 2;
else if ((code & 0xffff0000) == 0) return 3;
else if ((code & 0xffe00000) == 0) return 4;
else if ((code & 0xfc000000) == 0) return 5;
else if ((code & 0x80000000) == 0) return 6;
#ifdef USE_INVALID_CODE_SCHEME
else if (code == INVALID_CODE_FE) return 1;
else if (code == INVALID_CODE_FF) return 1;
#endif
else
return ONIGERR_INVALID_CODE_POINT_VALUE;
}
static int
code_to_mbc(OnigCodePoint code, UChar *buf)
{
#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80)
if ((code & 0xffffff80) == 0) {
*buf = (UChar )code;
return 1;
}
else {
UChar *p = buf;
if ((code & 0xfffff800) == 0) {
*p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
}
else if ((code & 0xffff0000) == 0) {
*p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
*p++ = UTF8_TRAILS(code, 6);
}
else if ((code & 0xffe00000) == 0) {
*p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
*p++ = UTF8_TRAILS(code, 12);
*p++ = UTF8_TRAILS(code, 6);
}
else if ((code & 0xfc000000) == 0) {
*p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
*p++ = UTF8_TRAILS(code, 18);
*p++ = UTF8_TRAILS(code, 12);
*p++ = UTF8_TRAILS(code, 6);
}
else if ((code & 0x80000000) == 0) {
*p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
*p++ = UTF8_TRAILS(code, 24);
*p++ = UTF8_TRAILS(code, 18);
*p++ = UTF8_TRAILS(code, 12);
*p++ = UTF8_TRAILS(code, 6);
}
#ifdef USE_INVALID_CODE_SCHEME
else if (code == INVALID_CODE_FE) {
*p = 0xfe;
return 1;
}
else if (code == INVALID_CODE_FF) {
*p = 0xff;
return 1;
}
#endif
else {
return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
}
*p++ = UTF8_TRAIL0(code);
return p - buf;
}
}
static int
mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
const UChar* end, UChar* fold)
{
const UChar* p = *pp;
if (ONIGENC_IS_MBC_ASCII(p)) {
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
if (*p == 0x49) {
*fold++ = 0xc4;
*fold = 0xb1;
(*pp)++;
return 2;
}
}
#endif
*fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
(*pp)++;
return 1; /* return byte length of converted char to lower */
}
else {
return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF8, flag,
pp, end, fold);
}
}
#if 0
static int
is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
{
const UChar* p = *pp;
if (ONIGENC_IS_MBC_ASCII(p)) {
(*pp)++;
return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
else {
(*pp) += enclen(ONIG_ENCODING_UTF8, p);
if (*p == 0xc3) {
int c = *(p + 1);
if (c >= 0x80) {
if (c <= (UChar )0x9e) { /* upper */
if (c == (UChar )0x97) return FALSE;
return TRUE;
}
else if (c >= (UChar )0xa0 && c <= (UChar )0xbe) { /* lower */
if (c == (UChar )'\267') return FALSE;
return TRUE;
}
else if (c == (UChar )0x9f &&
(flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
return TRUE;
}
}
}
}
return FALSE;
}
#endif
static int
get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
const OnigCodePoint* ranges[])
{
*sb_out = 0x80;
return onigenc_unicode_ctype_code_range(ctype, ranges);
}
static UChar*
left_adjust_char_head(const UChar* start, const UChar* s)
{
const UChar *p;
if (s <= start) return (UChar* )s;
p = s;
while (!utf8_islead(*p) && p > start) p--;
return (UChar* )p;
}
static int
get_case_fold_codes_by_str(OnigCaseFoldType flag,
const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
{
return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF8,
flag, p, end, items);
}
OnigEncodingType OnigEncodingUTF8 = {
mbc_enc_len,
"UTF-8", /* name */
6, /* max byte length */
1, /* min byte length */
is_mbc_newline,
mbc_to_code,
code_to_mbclen,
code_to_mbc,
mbc_case_fold,
onigenc_unicode_apply_all_case_fold,
get_case_fold_codes_by_str,
onigenc_unicode_property_name_to_ctype,
onigenc_unicode_is_code_ctype,
get_ctype_code_range,
left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match
};

View File

@@ -1,84 +0,0 @@
#define STDC_HEADERS 1
#define HAVE_SYS_TYPES_H 1
#define HAVE_SYS_STAT_H 1
#define HAVE_STDLIB_H 1
#define HAVE_STRING_H 1
#define HAVE_MEMORY_H 1
#define HAVE_FLOAT_H 1
#define HAVE_OFF_T 1
#define SIZEOF_INT 4
#define SIZEOF_SHORT 2
#define SIZEOF_LONG 4
#define SIZEOF_LONG_LONG 0
#define SIZEOF___INT64 8
#define SIZEOF_OFF_T 4
#define SIZEOF_VOIDP 4
#define SIZEOF_FLOAT 4
#define SIZEOF_DOUBLE 8
#define HAVE_PROTOTYPES 1
#define TOKEN_PASTE(x,y) x##y
#define HAVE_STDARG_PROTOTYPES 1
#ifndef NORETURN
#if _MSC_VER > 1100
#define NORETURN(x) __declspec(noreturn) x
#else
#define NORETURN(x) x
#endif
#endif
#define HAVE_DECL_SYS_NERR 1
#define STDC_HEADERS 1
#define HAVE_STDLIB_H 1
#define HAVE_STRING_H 1
#define HAVE_LIMITS_H 1
#define HAVE_FCNTL_H 1
#define HAVE_SYS_UTIME_H 1
#define HAVE_MEMORY_H 1
#define uid_t int
#define gid_t int
#define HAVE_STRUCT_STAT_ST_RDEV 1
#define HAVE_ST_RDEV 1
#define GETGROUPS_T int
#define RETSIGTYPE void
#define HAVE_ALLOCA 1
#define HAVE_DUP2 1
#define HAVE_MEMCMP 1
#define HAVE_MEMMOVE 1
#define HAVE_MKDIR 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRERROR 1
#define HAVE_STRFTIME 1
#define HAVE_STRCHR 1
#define HAVE_STRSTR 1
#define HAVE_STRTOD 1
#define HAVE_STRTOL 1
#define HAVE_STRTOUL 1
#define HAVE_FLOCK 1
#define HAVE_VSNPRINTF 1
#define HAVE_FINITE 1
#define HAVE_FMOD 1
#define HAVE_FREXP 1
#define HAVE_HYPOT 1
#define HAVE_MODF 1
#define HAVE_WAITPID 1
#define HAVE_CHSIZE 1
#define HAVE_TIMES 1
#define HAVE__SETJMP 1
#define HAVE_TELLDIR 1
#define HAVE_SEEKDIR 1
#define HAVE_MKTIME 1
#define HAVE_COSH 1
#define HAVE_SINH 1
#define HAVE_TANH 1
#define HAVE_EXECVE 1
#define HAVE_TZNAME 1
#define HAVE_DAYLIGHT 1
#define SETPGRP_VOID 1
#define inline __inline
#define NEED_IO_SEEK_BETWEEN_RW 1
#define RSHIFT(x,y) ((x)>>(int)y)
#define FILE_COUNT _cnt
#define FILE_READPTR _ptr
#define DEFAULT_KCODE KCODE_NONE
#define DLEXT ".so"
#define DLEXT2 ".dll"