forked from cory/tildefriends
Merge branches/quickjs to trunk. This is the way.
git-svn-id: https://www.unprompted.com/svn/projects/tildefriends/trunk@3621 ed5197a5-7fde-0310-b194-c3ffbd925b24
This commit is contained in:
187
deps/quickjs/libregexp.c
vendored
187
deps/quickjs/libregexp.c
vendored
@@ -75,7 +75,7 @@ typedef struct {
|
||||
int capture_count;
|
||||
int total_capture_count; /* -1 = not computed yet */
|
||||
int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
|
||||
void *mem_opaque;
|
||||
void *opaque;
|
||||
DynBuf group_names;
|
||||
union {
|
||||
char error_msg[TMP_BUF_SIZE];
|
||||
@@ -110,12 +110,14 @@ static inline int is_digit(int c) {
|
||||
return c >= '0' && c <= '9';
|
||||
}
|
||||
|
||||
/* insert 'len' bytes at position 'pos' */
|
||||
static void dbuf_insert(DynBuf *s, int pos, int len)
|
||||
/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
|
||||
static int dbuf_insert(DynBuf *s, int pos, int len)
|
||||
{
|
||||
dbuf_realloc(s, s->size + len);
|
||||
if (dbuf_realloc(s, s->size + len))
|
||||
return -1;
|
||||
memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
|
||||
s->size += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* canonicalize with the specific JS regexp rules */
|
||||
@@ -228,7 +230,7 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
|
||||
invert = c & 1;
|
||||
c_pt = char_range_table[c >> 1];
|
||||
len = *c_pt++;
|
||||
cr_init(cr, s->mem_opaque, lre_realloc);
|
||||
cr_init(cr, s->opaque, lre_realloc);
|
||||
for(i = 0; i < len * 2; i++) {
|
||||
if (cr_add_point(cr, c_pt[i]))
|
||||
goto fail;
|
||||
@@ -434,8 +436,14 @@ static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s,
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Return -1 in case of overflow */
|
||||
static int parse_digits(const uint8_t **pp)
|
||||
static int re_parse_out_of_memory(REParseState *s)
|
||||
{
|
||||
return re_parse_error(s, "out of memory");
|
||||
}
|
||||
|
||||
/* If allow_overflow is false, return -1 in case of
|
||||
overflow. Otherwise return INT32_MAX. */
|
||||
static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
|
||||
{
|
||||
const uint8_t *p;
|
||||
uint64_t v;
|
||||
@@ -448,8 +456,12 @@ static int parse_digits(const uint8_t **pp)
|
||||
if (c < '0' || c > '9')
|
||||
break;
|
||||
v = v * 10 + c - '0';
|
||||
if (v >= INT32_MAX)
|
||||
return -1;
|
||||
if (v >= INT32_MAX) {
|
||||
if (allow_overflow)
|
||||
v = INT32_MAX;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
p++;
|
||||
}
|
||||
*pp = p;
|
||||
@@ -557,7 +569,8 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '0' ... '7':
|
||||
case '0': case '1': case '2': case '3':
|
||||
case '4': case '5': case '6': case '7':
|
||||
c -= '0';
|
||||
if (allow_utf16 == 2) {
|
||||
/* only accept \0 not followed by digit */
|
||||
@@ -612,7 +625,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
||||
p++;
|
||||
q = name;
|
||||
while (is_unicode_char(*p)) {
|
||||
if ((q - name) > sizeof(name) - 1)
|
||||
if ((q - name) >= sizeof(name) - 1)
|
||||
goto unknown_property_name;
|
||||
*q++ = *p++;
|
||||
}
|
||||
@@ -621,7 +634,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
||||
if (*p == '=') {
|
||||
p++;
|
||||
while (is_unicode_char(*p)) {
|
||||
if ((q - value) > sizeof(value) - 1)
|
||||
if ((q - value) >= sizeof(value) - 1)
|
||||
return re_parse_error(s, "unknown unicode property value");
|
||||
*q++ = *p++;
|
||||
}
|
||||
@@ -638,7 +651,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
||||
} else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
|
||||
script_ext = TRUE;
|
||||
do_script:
|
||||
cr_init(cr, s->mem_opaque, lre_realloc);
|
||||
cr_init(cr, s->opaque, lre_realloc);
|
||||
ret = unicode_script(cr, value, script_ext);
|
||||
if (ret) {
|
||||
cr_free(cr);
|
||||
@@ -648,7 +661,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
||||
goto out_of_memory;
|
||||
}
|
||||
} else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
|
||||
cr_init(cr, s->mem_opaque, lre_realloc);
|
||||
cr_init(cr, s->opaque, lre_realloc);
|
||||
ret = unicode_general_category(cr, value);
|
||||
if (ret) {
|
||||
cr_free(cr);
|
||||
@@ -658,7 +671,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
||||
goto out_of_memory;
|
||||
}
|
||||
} else if (value[0] == '\0') {
|
||||
cr_init(cr, s->mem_opaque, lre_realloc);
|
||||
cr_init(cr, s->opaque, lre_realloc);
|
||||
ret = unicode_general_category(cr, name);
|
||||
if (ret == -1) {
|
||||
cr_free(cr);
|
||||
@@ -688,7 +701,7 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
||||
*pp = p;
|
||||
return 0;
|
||||
out_of_memory:
|
||||
return re_parse_error(s, "out of memory");
|
||||
return re_parse_out_of_memory(s);
|
||||
}
|
||||
#endif /* CONFIG_ALL_UNICODE */
|
||||
|
||||
@@ -851,7 +864,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
|
||||
CharRange cr1_s, *cr1 = &cr1_s;
|
||||
BOOL invert;
|
||||
|
||||
cr_init(cr, s->mem_opaque, lre_realloc);
|
||||
cr_init(cr, s->opaque, lre_realloc);
|
||||
p = *pp;
|
||||
p++; /* skip '[' */
|
||||
invert = FALSE;
|
||||
@@ -923,7 +936,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
|
||||
*pp = p;
|
||||
return 0;
|
||||
memory_error:
|
||||
re_parse_error(s, "out of memory");
|
||||
re_parse_out_of_memory(s);
|
||||
fail:
|
||||
cr_free(cr);
|
||||
return -1;
|
||||
@@ -1134,9 +1147,13 @@ static int re_parse_captures(REParseState *s, int *phas_named_captures,
|
||||
}
|
||||
}
|
||||
capture_index++;
|
||||
if (capture_index >= CAPTURE_COUNT_MAX)
|
||||
goto done;
|
||||
}
|
||||
} else {
|
||||
capture_index++;
|
||||
if (capture_index >= CAPTURE_COUNT_MAX)
|
||||
goto done;
|
||||
}
|
||||
break;
|
||||
case '\\':
|
||||
@@ -1150,6 +1167,7 @@ static int re_parse_captures(REParseState *s, int *phas_named_captures,
|
||||
break;
|
||||
}
|
||||
}
|
||||
done:
|
||||
if (capture_name)
|
||||
return -1;
|
||||
else
|
||||
@@ -1225,14 +1243,27 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
re_emit_op(s, REOP_prev);
|
||||
break;
|
||||
case '{':
|
||||
/* As an extension (see ES6 annex B), we accept '{' not
|
||||
followed by digits as a normal atom */
|
||||
if (!is_digit(p[1])) {
|
||||
if (s->is_utf16)
|
||||
goto invalid_quant_count;
|
||||
if (s->is_utf16) {
|
||||
return re_parse_error(s, "syntax error");
|
||||
} else if (!is_digit(p[1])) {
|
||||
/* Annex B: we accept '{' not followed by digits as a
|
||||
normal atom */
|
||||
goto parse_class_atom;
|
||||
} else {
|
||||
const uint8_t *p1 = p + 1;
|
||||
/* Annex B: error if it is like a repetition count */
|
||||
parse_digits(&p1, TRUE);
|
||||
if (*p1 == ',') {
|
||||
p1++;
|
||||
if (is_digit(*p1)) {
|
||||
parse_digits(&p1, TRUE);
|
||||
}
|
||||
}
|
||||
if (*p1 != '}') {
|
||||
goto parse_class_atom;
|
||||
}
|
||||
}
|
||||
/* fall tru */
|
||||
/* fall thru */
|
||||
case '*':
|
||||
case '+':
|
||||
case '?':
|
||||
@@ -1277,6 +1308,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
return -1;
|
||||
re_emit_op(s, REOP_match);
|
||||
/* jump after the 'match' after the lookahead is successful */
|
||||
if (dbuf_error(&s->byte_code))
|
||||
return -1;
|
||||
put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
|
||||
} else if (p[2] == '<') {
|
||||
p += 3;
|
||||
@@ -1383,11 +1416,13 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
}
|
||||
}
|
||||
goto normal_char;
|
||||
case '1' ... '9':
|
||||
case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8':
|
||||
case '9':
|
||||
{
|
||||
const uint8_t *q = ++p;
|
||||
|
||||
c = parse_digits(&p);
|
||||
c = parse_digits(&p, FALSE);
|
||||
if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
|
||||
if (!s->is_utf16) {
|
||||
/* Annex B.1.4: accept legacy octal */
|
||||
@@ -1407,7 +1442,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
}
|
||||
goto normal_char;
|
||||
}
|
||||
return re_parse_error(s, "back reference out of range in reguar expression");
|
||||
return re_parse_error(s, "back reference out of range in regular expression");
|
||||
}
|
||||
emit_back_reference:
|
||||
last_atom_start = s->byte_code.size;
|
||||
@@ -1484,32 +1519,38 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
quant_max = 1;
|
||||
goto quantifier;
|
||||
case '{':
|
||||
/* As an extension (see ES6 annex B), we accept '{' not
|
||||
followed by digits as a normal atom */
|
||||
if (!is_digit(p[1])) {
|
||||
if (s->is_utf16)
|
||||
goto invalid_quant_count;
|
||||
break;
|
||||
}
|
||||
p++;
|
||||
quant_min = parse_digits(&p);
|
||||
if (quant_min < 0) {
|
||||
invalid_quant_count:
|
||||
return re_parse_error(s, "invalid repetition count");
|
||||
}
|
||||
quant_max = quant_min;
|
||||
if (*p == ',') {
|
||||
p++;
|
||||
if (is_digit(*p)) {
|
||||
quant_max = parse_digits(&p);
|
||||
if (quant_max < 0 || quant_max < quant_min)
|
||||
{
|
||||
const uint8_t *p1 = p;
|
||||
/* As an extension (see ES6 annex B), we accept '{' not
|
||||
followed by digits as a normal atom */
|
||||
if (!is_digit(p[1])) {
|
||||
if (s->is_utf16)
|
||||
goto invalid_quant_count;
|
||||
} else {
|
||||
quant_max = INT32_MAX; /* infinity */
|
||||
break;
|
||||
}
|
||||
p++;
|
||||
quant_min = parse_digits(&p, TRUE);
|
||||
quant_max = quant_min;
|
||||
if (*p == ',') {
|
||||
p++;
|
||||
if (is_digit(*p)) {
|
||||
quant_max = parse_digits(&p, TRUE);
|
||||
if (quant_max < quant_min) {
|
||||
invalid_quant_count:
|
||||
return re_parse_error(s, "invalid repetition count");
|
||||
}
|
||||
} else {
|
||||
quant_max = INT32_MAX; /* infinity */
|
||||
}
|
||||
}
|
||||
if (*p != '}' && !s->is_utf16) {
|
||||
/* Annex B: normal atom if invalid '{' syntax */
|
||||
p = p1;
|
||||
break;
|
||||
}
|
||||
if (re_parse_expect(s, &p, '}'))
|
||||
return -1;
|
||||
}
|
||||
if (re_parse_expect(s, &p, '}'))
|
||||
return -1;
|
||||
quantifier:
|
||||
greedy = TRUE;
|
||||
if (*p == '?') {
|
||||
@@ -1524,12 +1565,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
|
||||
if (quant_max > 0) {
|
||||
/* specific optimization for simple quantifiers */
|
||||
if (dbuf_error(&s->byte_code))
|
||||
goto out_of_memory;
|
||||
len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
|
||||
s->byte_code.size - last_atom_start);
|
||||
if (len > 0) {
|
||||
re_emit_op(s, REOP_match);
|
||||
|
||||
dbuf_insert(&s->byte_code, last_atom_start, 17);
|
||||
if (dbuf_insert(&s->byte_code, last_atom_start, 17))
|
||||
goto out_of_memory;
|
||||
pos = last_atom_start;
|
||||
s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
|
||||
put_u32(&s->byte_code.buf[pos],
|
||||
@@ -1545,6 +1589,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
}
|
||||
}
|
||||
|
||||
if (dbuf_error(&s->byte_code))
|
||||
goto out_of_memory;
|
||||
add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start,
|
||||
s->byte_code.size - last_atom_start) == 0);
|
||||
} else {
|
||||
@@ -1558,7 +1604,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
/* need to reset the capture in case the atom is
|
||||
not executed */
|
||||
if (last_capture_count != s->capture_count) {
|
||||
dbuf_insert(&s->byte_code, last_atom_start, 3);
|
||||
if (dbuf_insert(&s->byte_code, last_atom_start, 3))
|
||||
goto out_of_memory;
|
||||
s->byte_code.buf[last_atom_start++] = REOP_save_reset;
|
||||
s->byte_code.buf[last_atom_start++] = last_capture_count;
|
||||
s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
|
||||
@@ -1566,12 +1613,14 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
if (quant_max == 0) {
|
||||
s->byte_code.size = last_atom_start;
|
||||
} else if (quant_max == 1) {
|
||||
dbuf_insert(&s->byte_code, last_atom_start, 5);
|
||||
if (dbuf_insert(&s->byte_code, last_atom_start, 5))
|
||||
goto out_of_memory;
|
||||
s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
|
||||
greedy;
|
||||
put_u32(s->byte_code.buf + last_atom_start + 1, len);
|
||||
} else if (quant_max == INT32_MAX) {
|
||||
dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check);
|
||||
if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
|
||||
goto out_of_memory;
|
||||
s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
|
||||
greedy;
|
||||
put_u32(s->byte_code.buf + last_atom_start + 1,
|
||||
@@ -1587,7 +1636,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
re_emit_goto(s, REOP_goto, last_atom_start);
|
||||
}
|
||||
} else {
|
||||
dbuf_insert(&s->byte_code, last_atom_start, 10);
|
||||
if (dbuf_insert(&s->byte_code, last_atom_start, 10))
|
||||
goto out_of_memory;
|
||||
pos = last_atom_start;
|
||||
s->byte_code.buf[pos++] = REOP_push_i32;
|
||||
put_u32(s->byte_code.buf + pos, quant_max);
|
||||
@@ -1605,7 +1655,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
if (quant_min == 1) {
|
||||
/* nothing to add */
|
||||
} else {
|
||||
dbuf_insert(&s->byte_code, last_atom_start, 5);
|
||||
if (dbuf_insert(&s->byte_code, last_atom_start, 5))
|
||||
goto out_of_memory;
|
||||
s->byte_code.buf[last_atom_start] = REOP_push_i32;
|
||||
put_u32(s->byte_code.buf + last_atom_start + 1,
|
||||
quant_min);
|
||||
@@ -1646,6 +1697,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
||||
done:
|
||||
s->buf_ptr = p;
|
||||
return 0;
|
||||
out_of_memory:
|
||||
return re_parse_out_of_memory(s);
|
||||
}
|
||||
|
||||
static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
|
||||
@@ -1686,6 +1739,9 @@ static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
|
||||
{
|
||||
int start, len, pos;
|
||||
|
||||
if (lre_check_stack_overflow(s->opaque, 0))
|
||||
return re_parse_error(s, "stack overflow");
|
||||
|
||||
start = s->byte_code.size;
|
||||
if (re_parse_alternative(s, is_backward_dir))
|
||||
return -1;
|
||||
@@ -1695,7 +1751,9 @@ static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
|
||||
len = s->byte_code.size - start;
|
||||
|
||||
/* insert a split before the first alternative */
|
||||
dbuf_insert(&s->byte_code, start, 5);
|
||||
if (dbuf_insert(&s->byte_code, start, 5)) {
|
||||
return re_parse_out_of_memory(s);
|
||||
}
|
||||
s->byte_code.buf[start] = REOP_split_next_first;
|
||||
put_u32(s->byte_code.buf + start + 1, len + 5);
|
||||
|
||||
@@ -1769,7 +1827,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
|
||||
BOOL is_sticky;
|
||||
|
||||
memset(s, 0, sizeof(*s));
|
||||
s->mem_opaque = opaque;
|
||||
s->opaque = opaque;
|
||||
s->buf_ptr = (const uint8_t *)buf;
|
||||
s->buf_end = s->buf_ptr + buf_len;
|
||||
s->buf_start = s->buf_ptr;
|
||||
@@ -1820,7 +1878,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
|
||||
}
|
||||
|
||||
if (dbuf_error(&s->byte_code)) {
|
||||
re_parse_error(s, "out of memory");
|
||||
re_parse_out_of_memory(s);
|
||||
goto error;
|
||||
}
|
||||
|
||||
@@ -2486,6 +2544,17 @@ int lre_get_flags(const uint8_t *bc_buf)
|
||||
return bc_buf[RE_HEADER_FLAGS];
|
||||
}
|
||||
|
||||
/* Return NULL if no group names. Otherwise, return a pointer to
|
||||
'capture_count - 1' zero terminated UTF-8 strings. */
|
||||
const char *lre_get_groupnames(const uint8_t *bc_buf)
|
||||
{
|
||||
uint32_t re_bytecode_len;
|
||||
if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
|
||||
return NULL;
|
||||
re_bytecode_len = get_u32(bc_buf + 3);
|
||||
return (const char *)(bc_buf + 7 + re_bytecode_len);
|
||||
}
|
||||
|
||||
#ifdef TEST
|
||||
|
||||
BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
|
||||
|
Reference in New Issue
Block a user