#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <Catalog.h>
#include <Errors.h>
#include "RegExp.h"
#undef B_TRANSLATION_CONTEXT
#define B_TRANSLATION_CONTEXT "libtracker"
const uint8 kRegExpMagic = 0234;
enum {
kRegExpEnd = 0,
kRegExpBol = 1,
kRegExpEol = 2,
kRegExpAny = 3,
kRegExpAnyOf = 4,
kRegExpAnyBut = 5,
kRegExpBranch = 6,
kRegExpBack = 7,
kRegExpExactly = 8,
kRegExpNothing = 9,
kRegExpStar = 10,
kRegExpPlus = 11,
kRegExpOpen = 20,
kRegExpClose = 30
};
const char* kMeta = "^$.[()|?+*\\";
const int32 kMaxSize = 32767L;
enum {
kHasWidth = 01,
kSimple = 02,
kSPStart = 04,
kWorst = 0
};
const char* kRegExpErrorStringArray[] = {
B_TRANSLATE_MARK("Unmatched parenthesis."),
B_TRANSLATE_MARK("Expression too long."),
B_TRANSLATE_MARK("Too many parenthesis."),
B_TRANSLATE_MARK("Junk on end."),
B_TRANSLATE_MARK("*+? operand may be empty."),
B_TRANSLATE_MARK("Nested *?+."),
B_TRANSLATE_MARK("Invalid bracket range."),
B_TRANSLATE_MARK("Unmatched brackets."),
B_TRANSLATE_MARK("Internal error."),
B_TRANSLATE_MARK("?+* follows nothing."),
B_TRANSLATE_MARK("Trailing \\."),
B_TRANSLATE_MARK("Corrupted expression."),
B_TRANSLATE_MARK("Memory corruption."),
B_TRANSLATE_MARK("Corrupted pointers."),
B_TRANSLATE_MARK("Corrupted opcode.")
};
#ifdef DEBUG
int32 regnarrate = 0;
#endif
RegExp::RegExp()
:
fError(B_OK),
fRegExp(NULL),
fInputScanPointer(NULL),
fParenthesisCount(0),
fDummy('\0'),
fCodeEmitPointer(NULL),
fCodeSize(0),
fStringInputPointer(NULL),
fRegBol(NULL),
fStartPArrayPointer(NULL),
fEndPArrayPointer(NULL)
{
}
RegExp::RegExp(const char* pattern)
:
fError(B_OK),
fRegExp(NULL)
{
fRegExp = Compile(pattern);
}
RegExp::RegExp(const BString& pattern)
:
fError(B_OK),
fRegExp(NULL)
{
fRegExp = Compile(pattern.String());
}
RegExp::~RegExp()
{
free(fRegExp);
}
status_t
RegExp::InitCheck() const
{
return fError;
}
status_t
RegExp::SetTo(const char* pattern)
{
fError = B_OK;
free(fRegExp);
fRegExp = Compile(pattern);
return fError;
}
status_t
RegExp::SetTo(const BString& pattern)
{
fError = B_OK;
free(fRegExp);
fRegExp = Compile(pattern.String());
return fError;
}
bool
RegExp::Matches(const char* string) const
{
if (fRegExp == NULL || string == NULL)
return false;
return RunMatcher(fRegExp, string) == 1;
}
bool
RegExp::Matches(const BString& string) const
{
if (fRegExp == NULL)
return false;
return RunMatcher(fRegExp, string.String()) == 1;
}
regexp*
RegExp::Compile(const char* exp)
{
regexp* r;
const char* scan;
const char* longest;
int32 len;
int32 flags;
if (exp == NULL) {
SetError(B_BAD_VALUE);
return NULL;
}
fInputScanPointer = exp;
fParenthesisCount = 1;
fCodeSize = 0L;
fCodeEmitPointer = &fDummy;
Char(kRegExpMagic);
if (Reg(0, &flags) == NULL)
return NULL;
if (fCodeSize >= kMaxSize) {
SetError(REGEXP_TOO_BIG);
return NULL;
}
r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
if (r == NULL) {
SetError(B_NO_MEMORY);
return NULL;
}
fInputScanPointer = exp;
fParenthesisCount = 1;
fCodeEmitPointer = r->program;
Char(kRegExpMagic);
if (Reg(0, &flags) == NULL) {
free(r);
return NULL;
}
r->regstart = '\0';
r->reganch = 0;
r->regmust = NULL;
r->regmlen = 0;
scan = r->program + 1;
if (*Next((char*)scan) == kRegExpEnd) {
scan = Operand(scan);
if (*scan == kRegExpExactly)
r->regstart = *Operand(scan);
else if (*scan == kRegExpBol)
r->reganch++;
if ((flags & kSPStart) != 0) {
longest = NULL;
len = 0;
for (; scan != NULL; scan = Next((char*)scan)) {
if (*scan == kRegExpExactly
&& (int32)strlen(Operand(scan)) >= len) {
longest = Operand(scan);
len = (int32)strlen(Operand(scan));
}
}
r->regmust = longest;
r->regmlen = len;
}
}
return r;
}
regexp*
RegExp::Expression() const
{
return fRegExp;
}
const char*
RegExp::ErrorString() const
{
if (fError >= REGEXP_UNMATCHED_PARENTHESIS
&& fError <= REGEXP_CORRUPTED_OPCODE) {
return B_TRANSLATE_NOCOLLECT(
kRegExpErrorStringArray[fError - B_ERRORS_END]);
}
return strerror(fError);
}
void
RegExp::SetError(status_t error) const
{
fError = error;
}
char*
RegExp::Reg(int32 paren, int32* flagp)
{
char* ret;
char* br;
char* ender;
int32 parno = 0;
int32 flags;
*flagp = kHasWidth;
if (paren) {
if (fParenthesisCount >= kSubExpressionMax) {
SetError(REGEXP_TOO_MANY_PARENTHESIS);
return NULL;
}
parno = fParenthesisCount;
fParenthesisCount++;
ret = Node((char)(kRegExpOpen + parno));
} else
ret = NULL;
br = Branch(&flags);
if (br == NULL)
return NULL;
if (ret != NULL) {
Tail(ret, br);
} else
ret = br;
if (!(flags & kHasWidth))
*flagp &= ~kHasWidth;
*flagp |= flags&kSPStart;
while (*fInputScanPointer == '|') {
fInputScanPointer++;
br = Branch(&flags);
if (br == NULL)
return NULL;
Tail(ret, br);
if (!(flags & kHasWidth))
*flagp &= ~kHasWidth;
*flagp |= flags&kSPStart;
}
ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
Tail(ret, ender);
for (br = ret; br != NULL; br = Next(br))
OpTail(br, ender);
if (paren && *fInputScanPointer++ != ')') {
SetError(REGEXP_UNMATCHED_PARENTHESIS);
return NULL;
} else if (!paren && *fInputScanPointer != '\0') {
if (*fInputScanPointer == ')') {
SetError(REGEXP_UNMATCHED_PARENTHESIS);
return NULL;
} else {
SetError(REGEXP_JUNK_ON_END);
return NULL;
}
}
return ret;
}
char*
RegExp::Branch(int32* flagp)
{
char* ret;
char* chain;
char* latest;
int32 flags;
*flagp = kWorst;
ret = Node(kRegExpBranch);
chain = NULL;
while (*fInputScanPointer != '\0'
&& *fInputScanPointer != '|'
&& *fInputScanPointer != ')') {
latest = Piece(&flags);
if (latest == NULL)
return NULL;
*flagp |= flags & kHasWidth;
if (chain == NULL) {
*flagp |= flags & kSPStart;
} else
Tail(chain, latest);
chain = latest;
}
if (chain == NULL) {
Node(kRegExpNothing);
}
return ret;
}
char*
RegExp::Piece(int32* flagp)
{
char* ret;
char op;
char* next;
int32 flags;
ret = Atom(&flags);
if (ret == NULL)
return NULL;
op = *fInputScanPointer;
if (!IsMult(op)) {
*flagp = flags;
return ret;
}
if (!(flags & kHasWidth) && op != '?') {
SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
return NULL;
}
*flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth;
if (op == '*' && (flags & kSimple))
Insert(kRegExpStar, ret);
else if (op == '*') {
Insert(kRegExpBranch, ret);
OpTail(ret, Node(kRegExpBack));
OpTail(ret, ret);
Tail(ret, Node(kRegExpBranch));
Tail(ret, Node(kRegExpNothing));
} else if (op == '+' && (flags & kSimple))
Insert(kRegExpPlus, ret);
else if (op == '+') {
next = Node(kRegExpBranch);
Tail(ret, next);
Tail(Node(kRegExpBack), ret);
Tail(next, Node(kRegExpBranch));
Tail(ret, Node(kRegExpNothing));
} else if (op == '?') {
Insert(kRegExpBranch, ret);
Tail(ret, Node(kRegExpBranch));
next = Node(kRegExpNothing);
Tail(ret, next);
OpTail(ret, next);
}
fInputScanPointer++;
if (IsMult(*fInputScanPointer)) {
SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
return NULL;
}
return ret;
}
char*
RegExp::Atom(int32* flagp)
{
char* ret;
int32 flags;
*flagp = kWorst;
switch (*fInputScanPointer++) {
case '^':
ret = Node(kRegExpBol);
break;
case '$':
ret = Node(kRegExpEol);
break;
case '.':
ret = Node(kRegExpAny);
*flagp |= kHasWidth|kSimple;
break;
case '[':
{
int32 cclass;
int32 classend;
if (*fInputScanPointer == '^') {
ret = Node(kRegExpAnyBut);
fInputScanPointer++;
} else
ret = Node(kRegExpAnyOf);
if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
Char(*fInputScanPointer++);
while (*fInputScanPointer != '\0'
&& *fInputScanPointer != ']') {
if (*fInputScanPointer == '-') {
fInputScanPointer++;
if (*fInputScanPointer == ']'
|| *fInputScanPointer == '\0') {
Char('-');
} else {
cclass = UCharAt(fInputScanPointer - 2) + 1;
classend = UCharAt(fInputScanPointer);
if (cclass > classend + 1) {
SetError(REGEXP_INVALID_BRACKET_RANGE);
return NULL;
}
for (; cclass <= classend; cclass++)
Char((char)cclass);
fInputScanPointer++;
}
} else
Char(*fInputScanPointer++);
}
Char('\0');
if (*fInputScanPointer != ']') {
SetError(REGEXP_UNMATCHED_BRACKET);
return NULL;
}
fInputScanPointer++;
*flagp |= kHasWidth | kSimple;
break;
}
case '(':
ret = Reg(1, &flags);
if (ret == NULL)
return NULL;
*flagp |= flags & (kHasWidth | kSPStart);
break;
case '\0':
case '|':
case ')':
SetError(REGEXP_INTERNAL_ERROR);
return NULL;
case '?':
case '+':
case '*':
SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
return NULL;
case '\\':
if (*fInputScanPointer == '\0') {
SetError(REGEXP_TRAILING_BACKSLASH);
return NULL;
}
ret = Node(kRegExpExactly);
Char(*fInputScanPointer++);
Char('\0');
*flagp |= kHasWidth|kSimple;
break;
default:
{
int32 len;
char ender;
fInputScanPointer--;
len = (int32)strcspn(fInputScanPointer, kMeta);
if (len <= 0) {
SetError(REGEXP_INTERNAL_ERROR);
return NULL;
}
ender = *(fInputScanPointer + len);
if (len > 1 && IsMult(ender)) {
len--;
}
*flagp |= kHasWidth;
if (len == 1)
*flagp |= kSimple;
ret = Node(kRegExpExactly);
while (len > 0) {
Char(*fInputScanPointer++);
len--;
}
Char('\0');
break;
}
}
return ret;
}
char*
RegExp::Node(char op)
{
char* ret;
char* ptr;
ret = fCodeEmitPointer;
if (ret == &fDummy) {
fCodeSize += 3;
return ret;
}
ptr = ret;
*ptr++ = op;
*ptr++ = '\0';
*ptr++ = '\0';
fCodeEmitPointer = ptr;
return ret;
}
void
RegExp::Char(char b)
{
if (fCodeEmitPointer != &fDummy)
*fCodeEmitPointer++ = b;
else
fCodeSize++;
}
void
RegExp::Insert(char op, char* opnd)
{
char* src;
char* dst;
char* place;
if (fCodeEmitPointer == &fDummy) {
fCodeSize += 3;
return;
}
src = fCodeEmitPointer;
fCodeEmitPointer += 3;
dst = fCodeEmitPointer;
while (src > opnd)
*--dst = *--src;
place = opnd;
*place++ = op;
*place++ = '\0';
*place++ = '\0';
}
void
RegExp::Tail(char* p, char* val)
{
char* scan;
char* temp;
int32 offset;
if (p == &fDummy)
return;
scan = p;
for (;;) {
temp = Next(scan);
if (temp == NULL)
break;
scan = temp;
}
if (scan[0] == kRegExpBack)
offset = scan - val;
else
offset = val - scan;
scan[1] = (char)((offset >> 8) & 0377);
scan[2] = (char)(offset & 0377);
}
void
RegExp::OpTail(char* p, char* val)
{
if (p == NULL || p == &fDummy || *p != kRegExpBranch)
return;
Tail(Operand(p), val);
}
int32
RegExp::RunMatcher(regexp* prog, const char* string) const
{
const char* s;
if (prog == NULL || string == NULL) {
SetError(B_BAD_VALUE);
return 0;
}
if (UCharAt(prog->program) != kRegExpMagic) {
SetError(REGEXP_CORRUPTED_PROGRAM);
return 0;
}
if (prog->regmust != NULL) {
s = string;
while ((s = strchr(s, prog->regmust[0])) != NULL) {
if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
break;
}
s++;
}
if (s == NULL) {
return 0;
}
}
fRegBol = string;
if (prog->reganch)
return Try(prog, (char*)string);
s = string;
if (prog->regstart != '\0') {
while ((s = strchr(s, prog->regstart)) != NULL) {
if (Try(prog, (char*)s))
return 1;
s++;
}
} else {
do {
if (Try(prog, (char*)s))
return 1;
} while (*s++ != '\0');
}
return 0;
}
int32
RegExp::Try(regexp* prog, const char* string) const
{
int32 i;
const char **sp;
const char **ep;
fStringInputPointer = string;
fStartPArrayPointer = prog->startp;
fEndPArrayPointer = prog->endp;
sp = prog->startp;
ep = prog->endp;
for (i = kSubExpressionMax; i > 0; i--) {
*sp++ = NULL;
*ep++ = NULL;
}
if (Match(prog->program + 1)) {
prog->startp[0] = string;
prog->endp[0] = fStringInputPointer;
return 1;
} else
return 0;
}
int32
RegExp::Match(const char* prog) const
{
const char* scan;
const char* next;
scan = prog;
#ifdef DEBUG
if (scan != NULL && regnarrate)
fprintf(stderr, "%s(\n", Prop(scan));
#endif
while (scan != NULL) {
#ifdef DEBUG
if (regnarrate)
fprintf(stderr, "%s...\n", Prop(scan));
#endif
next = Next(scan);
switch (*scan) {
case kRegExpBol:
if (fStringInputPointer != fRegBol)
return 0;
break;
case kRegExpEol:
if (*fStringInputPointer != '\0')
return 0;
break;
case kRegExpAny:
if (*fStringInputPointer == '\0')
return 0;
fStringInputPointer++;
break;
case kRegExpExactly:
{
const char* opnd = Operand(scan);
if (*opnd != *fStringInputPointer)
return 0;
uint32 len = strlen(opnd);
if (len > 1
&& strncmp(opnd, fStringInputPointer, len) != 0) {
return 0;
}
fStringInputPointer += len;
}
break;
case kRegExpAnyOf:
if (*fStringInputPointer == '\0'
|| strchr(Operand(scan), *fStringInputPointer) == NULL) {
return 0;
}
fStringInputPointer++;
break;
case kRegExpAnyBut:
if (*fStringInputPointer == '\0'
|| strchr(Operand(scan), *fStringInputPointer) != NULL) {
return 0;
}
fStringInputPointer++;
break;
case kRegExpNothing:
break;
case kRegExpBack:
break;
case kRegExpOpen + 1:
case kRegExpOpen + 2:
case kRegExpOpen + 3:
case kRegExpOpen + 4:
case kRegExpOpen + 5:
case kRegExpOpen + 6:
case kRegExpOpen + 7:
case kRegExpOpen + 8:
case kRegExpOpen + 9:
{
int32 no;
const char* save;
no = *scan - kRegExpOpen;
save = fStringInputPointer;
if (Match(next)) {
if (fStartPArrayPointer[no] == NULL)
fStartPArrayPointer[no] = save;
return 1;
} else
return 0;
}
break;
case kRegExpClose + 1:
case kRegExpClose + 2:
case kRegExpClose + 3:
case kRegExpClose + 4:
case kRegExpClose + 5:
case kRegExpClose + 6:
case kRegExpClose + 7:
case kRegExpClose + 8:
case kRegExpClose + 9:
{
int32 no;
const char* save;
no = *scan - kRegExpClose;
save = fStringInputPointer;
if (Match(next)) {
if (fEndPArrayPointer[no] == NULL)
fEndPArrayPointer[no] = save;
return 1;
} else
return 0;
}
break;
case kRegExpBranch:
{
const char* save;
if (*next != kRegExpBranch) {
next = Operand(scan);
} else {
do {
save = fStringInputPointer;
if (Match(Operand(scan)))
return 1;
fStringInputPointer = save;
scan = Next(scan);
} while (scan != NULL && *scan == kRegExpBranch);
return 0;
}
}
break;
case kRegExpStar:
case kRegExpPlus:
{
char nextch;
int32 no;
const char* save;
int32 min;
nextch = '\0';
if (*next == kRegExpExactly)
nextch = *Operand(next);
min = (*scan == kRegExpStar) ? 0 : 1;
save = fStringInputPointer;
no = Repeat(Operand(scan));
while (no >= min) {
if (nextch == '\0' || *fStringInputPointer == nextch) {
if (Match(next))
return 1;
}
no--;
fStringInputPointer = save + no;
}
return 0;
}
break;
case kRegExpEnd:
return 1;
default:
SetError(REGEXP_MEMORY_CORRUPTION);
return 0;
}
scan = next;
}
SetError(REGEXP_CORRUPTED_POINTERS);
return 0;
}
int32
RegExp::Repeat(const char* p) const
{
int32 count = 0;
const char* scan;
const char* opnd;
scan = fStringInputPointer;
opnd = Operand(p);
switch (*p) {
case kRegExpAny:
count = (int32)strlen(scan);
scan += count;
break;
case kRegExpExactly:
while (*opnd == *scan) {
count++;
scan++;
}
break;
case kRegExpAnyOf:
while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
count++;
scan++;
}
break;
case kRegExpAnyBut:
while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
count++;
scan++;
}
break;
default:
SetError(REGEXP_INTERNAL_ERROR);
count = 0;
break;
}
fStringInputPointer = scan;
return count;
}
char*
RegExp::Next(char* p)
{
int32 offset;
if (p == &fDummy)
return NULL;
offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
if (offset == 0)
return NULL;
if (*p == kRegExpBack)
return p - offset;
else
return p + offset;
}
const char*
RegExp::Next(const char* p) const
{
int32 offset;
if (p == &fDummy)
return NULL;
offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
if (offset == 0)
return NULL;
if (*p == kRegExpBack)
return p - offset;
else
return p + offset;
}
inline int32
RegExp::UCharAt(const char* p) const
{
return (int32)*(unsigned char *)p;
}
inline char*
RegExp::Operand(char* p) const
{
return p + 3;
}
inline const char*
RegExp::Operand(const char* p) const
{
return p + 3;
}
inline bool
RegExp::IsMult(char c) const
{
return c == '*' || c == '+' || c == '?';
}
#ifdef DEBUG
void
RegExp::Dump()
{
const char* s;
char op = kRegExpExactly;
const char* next;
s = fRegExp->program + 1;
while (op != kRegExpEnd) {
op = *s;
printf("%2ld%s", s - fRegExp->program, Prop(s));
next = Next(s);
if (next == NULL) {
printf("(0)");
} else
printf("(%ld)", (s - fRegExp->program) + (next - s));
s += 3;
if (op == kRegExpAnyOf || op == kRegExpAnyBut
|| op == kRegExpExactly) {
while (*s != '\0') {
putchar(*s);
s++;
}
s++;
}
putchar('\n');
}
if (fRegExp->regstart != '\0')
printf("start `%c' ", fRegExp->regstart);
if (fRegExp->reganch)
printf("anchored ");
if (fRegExp->regmust != NULL)
printf("must have \"%s\"", fRegExp->regmust);
printf("\n");
}
char*
RegExp::Prop(const char* op) const
{
const char* p = NULL;
static char buf[50];
strcpy(buf, ":");
switch (*op) {
case kRegExpBol:
p = "kRegExpBol";
break;
case kRegExpEol:
p = "kRegExpEol";
break;
case kRegExpAny:
p = "kRegExpAny";
break;
case kRegExpAnyOf:
p = "kRegExpAnyOf";
break;
case kRegExpAnyBut:
p = "kRegExpAnyBut";
break;
case kRegExpBranch:
p = "kRegExpBranch";
break;
case kRegExpExactly:
p = "kRegExpExactly";
break;
case kRegExpNothing:
p = "kRegExpNothing";
break;
case kRegExpBack:
p = "kRegExpBack";
break;
case kRegExpEnd:
p = "kRegExpEnd";
break;
case kRegExpOpen + 1:
case kRegExpOpen + 2:
case kRegExpOpen + 3:
case kRegExpOpen + 4:
case kRegExpOpen + 5:
case kRegExpOpen + 6:
case kRegExpOpen + 7:
case kRegExpOpen + 8:
case kRegExpOpen + 9:
sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
p = NULL;
break;
case kRegExpClose + 1:
case kRegExpClose + 2:
case kRegExpClose + 3:
case kRegExpClose + 4:
case kRegExpClose + 5:
case kRegExpClose + 6:
case kRegExpClose + 7:
case kRegExpClose + 8:
case kRegExpClose + 9:
sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
p = NULL;
break;
case kRegExpStar:
p = "kRegExpStar";
break;
case kRegExpPlus:
p = "kRegExpPlus";
break;
default:
RegExpError("corrupted opcode");
break;
}
if (p != NULL)
strcat(buf, p);
return buf;
}
void
RegExp::RegExpError(const char*) const
{
}
#endif