#include "common.h" enum { max_commands = 16, inputbuf_size = 32 * 1024, regbuf_size = 40, text_size = 1024, }; struct command { struct str condition; struct str body; struct str result; }; enum regex_type { rx_char, rx_any, rx_range, rx_rep, rx_rep_nongreedy, rx_concat, }; struct regex { enum regex_type type; int submatch_id; union { char c; unsigned long long table[2]; struct regex *child; struct { struct regex **children; int childnum; }; }; }; struct regbuf { struct regex *buf[regbuf_size]; int size; }; struct parsing_state { int candidate_id; char ids[10]; }; enum opcode { op_char, /* char c */ op_match, /* match */ op_any, /* any */ op_range, /* range [...] */ op_jmp, /* jmp L1 */ op_split, /* split L1, L2 */ op_save, /* save n */ }; struct ins { enum opcode opcode; union { char c; unsigned long long table[2]; int n; struct ins *arg_labels[2]; }; }; struct regex_prog { struct ins *text; int len; int size; }; struct submatches { char *saves[20]; int refcount; }; struct thread { struct ins *pc; struct submatches *submatches; }; struct node { struct node *next; struct node *prev; struct thread t; }; struct threadlist { struct node *anchor; struct node anchor_body; struct ins *ref; struct node **presence_table; }; struct input_buf { char buf[inputbuf_size]; char *start; char *end; } input_buf; struct compiled_command { struct str str_condition; struct regex_prog condition; struct str str_body; struct regex_prog body; struct str str_result; }; static struct compiled_command command_buf[max_commands]; static int command_num = 0; static void inbuf_init() { input_buf.start = input_buf.end = input_buf.buf; } static int readline(struct str *line) { char *start = input_buf.start; char *tmp; int none_processed = 1; for (;;) { for (tmp = start; tmp != input_buf.end; ++tmp) { none_processed = 0; if (*tmp == '\n') { line->start = input_buf.start; line->end = tmp; input_buf.start = tmp + 1; return 1; } } if (input_buf.end < input_buf.buf + inputbuf_size / 2) { int rem, ret; read: rem = input_buf.buf + inputbuf_size - input_buf.end; ret = read(0, input_buf.end, rem); if (ret == -1) syscall_error("stdin"); if (ret == 0) { if (none_processed) return 0; error("unexpected input line end"); } start = input_buf.end; input_buf.end += ret; } else { int size; if (input_buf.start == input_buf.buf) error("input line is too long"); size = input_buf.end - input_buf.start; memmove(input_buf.buf, input_buf.start, size); input_buf.start = input_buf.buf; input_buf.end = input_buf.buf + size; goto read; } } } static struct regex *regex_alloc(enum regex_type type) { struct regex *tmp = malloc(sizeof(struct regex)); tmp->type = type; tmp->submatch_id = -1; return tmp; } static void regbuf_add(struct regbuf *buf, struct regex *r) { if (buf->size == regbuf_size) error("regbuf: no more space"); buf->buf[buf->size] = r; ++buf->size; } static struct regex *regbuf_pop(struct regbuf *buf) { if (buf->size == 0) return NULL; --buf->size; return buf->buf[buf->size]; } static char *find_end(struct str *s) { for (; s->start < s->end; ++s->start) { if (*s->start == ']') { char *tmp = s->start; s->start = tmp + 1; return tmp; } } error("regex_parse: missing ']'"); return NULL; } static void __parse_brackets(struct regex *r, struct str *s) { unsigned long long constant; char *tmp; if (s->start == s->end) goto error; if (*s->start == '^') { constant = -1ull; ++s->start; if (s->start == s->end) goto error; } else { constant = 0; } r->table[0] = r->table[1] = constant; for (tmp = s->start; tmp != s->end; ++tmp) { /* flip bit */ r->table[*tmp / 64] ^= (1ull << (*tmp % 64)); } return; error: error("regex_parse: met [] or [^]"); } static void parse_brackets(struct regbuf *buf, struct str *rx) { struct str brackets; struct regex *r; ++rx->start; brackets = (struct str) { rx->start, find_end(rx) }; r = regex_alloc(rx_range); __parse_brackets(r, &brackets); regbuf_add(buf, r); } static void parse_rep(struct regbuf *buf, struct str *rx) { enum regex_type type; struct regex *r, *tmp; if (rx->start + 1 != rx->end && rx->start[1] == '?') { type = rx_rep_nongreedy; rx->start += 2; } else { type = rx_rep; ++rx->start; } r = regex_alloc(type); tmp = regbuf_pop(buf); if (tmp == NULL) error("regex_parse: stray '*'"); r->child = tmp; regbuf_add(buf, r); } static char *find_pend(struct str *rx) { int level = 0; for (; rx->start + 1 < rx->end; ++rx->start) { if (*rx->start == '\\' && rx->start[1] == '(') ++level; if (*rx->start == '\\' && rx->start[1] == ')') { if (level == 0) { char *tmp = rx->start; rx->start += 2; return tmp; } else { --level; } } } error("regex_parse: missing \\)"); return NULL; } static int pick_id(struct parsing_state *state) { int i; if (state->candidate_id != -1) { int tmp = state->candidate_id; state->ids[tmp] = 0; state->candidate_id = -1; return tmp; } /* 0, 8 and 9 are for internal use, so they must be requested * explicitly" */ for (i = 1; i < 8; ++i) { if (state->ids[i] == 1) { state->ids[i] = 0; return i; } } error("regex_parse: space of submatch ids is exhausted"); return -1; } static struct regex *__regex_parse(struct parsing_state *state, struct str *rx); static void parse_parenthesis(struct parsing_state *state, struct regbuf *buf, struct str *rx) { struct str parenthesis; struct regex *r; int id; ++rx->start; id = pick_id(state); parenthesis = (struct str) { rx->start, find_pend(rx) }; r = __regex_parse(state, &parenthesis); r->submatch_id = id; regbuf_add(buf, r); } static void parse_backslash(struct parsing_state *state, struct regbuf *buf, struct str *rx) { struct regex *r; if (rx->start + 1 == rx->end) error("regex_parse: stray '\\'"); ++rx->start; switch(*rx->start) { case '(': parse_parenthesis(state, buf, rx); break; case '0' ... '9': state->candidate_id = *rx->start - '0'; ++rx->start; break; default: r = regex_alloc(rx_char); r->c = *rx->start; regbuf_add(buf, r); ++rx->start; } } static struct regex *collect(struct regex **buf, int len) { if (len == 0) error("regex_parse: empty regex"); if (len == 1) { return *buf; } else { int size; struct regex *r = regex_alloc(rx_concat); r->childnum = len; size = sizeof(struct regex *) * len; r->children = malloc(size); memcpy(r->children, buf, size); return r; } } static struct regex *__regex_parse(struct parsing_state *state, struct str *rx) { struct regbuf rbuf; struct regex *r; rbuf.size = 0; while (rx->start != rx->end) { switch(*rx->start) { case '.': r = regex_alloc(rx_any); break; case '[': parse_brackets(&rbuf, rx); continue; case '*': parse_rep(&rbuf, rx); continue; case '\\': parse_backslash(state, &rbuf, rx); continue; default: r = regex_alloc(rx_char); r->c = *rx->start; } regbuf_add(&rbuf, r); ++rx->start; } return collect(rbuf.buf, rbuf.size); } static struct regex *regex_parse(struct str rx) { struct parsing_state state; struct regex *buf[3]; int idx = 0, inc = 0; memset(state.ids, 1, 10); state.candidate_id = -1; if (rx.start != rx.end && *rx.start != '^') buf[idx++] = __regex_parse(&state, &CSTR("\\8\\(.*?\\)")); else ++rx.start; if (rx.start != rx.end && rx.end[-1] != '$') { buf[idx + 1] = __regex_parse(&state, &CSTR("\\9\\(.*\\)")); inc = 1; } else { --rx.end; } buf[idx] = __regex_parse(&state, &rx); if (buf[idx]->submatch_id != -1) error("regex_parse: whole regex receives id=0 implicitly"); buf[idx]->submatch_id = 0; ++idx; idx += inc; return collect(buf, idx); } static char *print_range(unsigned long long *table) { static char buf[68]; int idx = 0; int i; unsigned long long tcopy[2] = { table[0], table[1] }; buf[idx++] = '['; int raised_bits = __builtin_popcountll(tcopy[0]) + __builtin_popcountll(tcopy[1]); if (raised_bits > 64) { buf[idx++] = '^'; tcopy[0] = ~tcopy[0]; tcopy[1] = ~tcopy[1]; } for (i = 0; i < 128; ++i) { if (tcopy[i / 64] & (1ull << (i % 64))) buf[idx++] = i; } buf[idx++] = ']'; return buf; } static void __regex_print(struct regex *r) { int i; if (r->submatch_id != -1) printf("%d:", r->submatch_id); switch(r->type) { case rx_char: if (r->c == ' ') printf("' '"); else printf("%c", r->c); return; case rx_any: printf("."); return; case rx_range: printf("%s", print_range(r->table)); return; case rx_concat: printf("(| "); for (i = 0; i < r->childnum; ++i) { __regex_print(r->children[i]); if (i + 1 < r->childnum) printf(" "); else printf(")"); } return; case rx_rep: case rx_rep_nongreedy: printf("(*"); if (r->type == rx_rep_nongreedy) printf("? "); else printf(" "); __regex_print(r->child); printf(")"); return; } } static void regex_print(struct regex *r) { __regex_print(r); printf("\n"); } static void regex_prog_alloc(struct regex_prog *prog) { prog->size = text_size; prog->text = malloc(sizeof(struct ins) * prog->size); prog->len = 0; } static struct ins *prog_get_ins(struct regex_prog *prog, int inc) { struct ins *tmp; if (prog->len == prog->size) error("regex_prog: text section is too big"); tmp = prog->text + prog->len; if (inc) ++prog->len; return tmp; } static void __regex_compile(struct regex_prog *prog, struct regex *r); static void compile_rep(struct regex_prog *prog, struct regex *r) { struct ins *tmp, *tmp2; tmp = prog_get_ins(prog, 1); tmp->opcode = op_split; prog_get_ins(prog, 0); tmp->arg_labels[0] = tmp + 1; __regex_compile(prog, r->child); tmp2 = prog_get_ins(prog, 1); tmp2->opcode = op_jmp; tmp2->arg_labels[0] = tmp; tmp2 = prog_get_ins(prog, 0); tmp->arg_labels[1] = tmp2; if (r->type == rx_rep_nongreedy) { struct ins *tmp3 = tmp->arg_labels[1]; tmp->arg_labels[1] = tmp->arg_labels[0]; tmp->arg_labels[0] = tmp3; } } static void __regex_compile(struct regex_prog *prog, struct regex *r) { struct ins *tmp; int i; if (r->submatch_id != -1) { tmp = prog_get_ins(prog, 1); tmp->opcode = op_save; tmp->n = r->submatch_id * 2; } switch(r->type) { case rx_char: tmp = prog_get_ins(prog, 1); tmp->opcode = op_char; tmp->c = r->c; break; case rx_any: tmp = prog_get_ins(prog, 1); tmp->opcode = op_any; break; case rx_range: tmp = prog_get_ins(prog, 1); tmp->opcode = op_range; tmp->table[0] = r->table[0]; tmp->table[1] = r->table[1]; break; case rx_concat: for (i = 0; i < r->childnum; ++i) __regex_compile(prog, r->children[i]); break; case rx_rep: case rx_rep_nongreedy: compile_rep(prog, r); break; } if (r->submatch_id != -1) { tmp = prog_get_ins(prog, 1); tmp->opcode = op_save; tmp->n = r->submatch_id * 2 + 1; } } static void regex_compile(struct regex_prog *prog, struct regex *r) { struct ins *tmp; regex_prog_alloc(prog); __regex_compile(prog, r); tmp = prog_get_ins(prog, 1); tmp->opcode = op_match; } static void assign_L(int *labels, int offset) { static int next_label = 0; if (labels[offset] == -1) { labels[offset] = next_label; ++next_label; } } static void fill_label_table(int *labels, struct regex_prog *prog) { struct ins *tmp; for (tmp = prog->text; tmp < prog->text + prog->len; ++tmp) { if (tmp->opcode == op_jmp) { assign_L(labels, tmp->arg_labels[0] - prog->text); } if (tmp->opcode == op_split) { assign_L(labels, tmp->arg_labels[0] - prog->text); assign_L(labels, tmp->arg_labels[1] - prog->text); } } } static void print_ins(struct ins *ins, int *labels, struct regex_prog *prog) { switch(ins->opcode) { case op_char: printf("\tchar "); if (ins->c == ' ') printf("' '\n"); else printf("%c\n", ins->c); break; case op_any: printf("\tany\n"); break; case op_range: printf("\trange %s\n", print_range(ins->table)); break; case op_match: printf("\tmatch\n"); break; case op_split: printf("\tsplit L%d, L%d\n", labels[ins->arg_labels[0] - prog->text], labels[ins->arg_labels[1] - prog->text]); break; case op_jmp: printf("\tjmp L%d\n", labels[ins->arg_labels[0] - prog->text]); break; case op_save: printf("\tsave %d\n", ins->n); break; } } static void regex_prog_print(struct regex_prog *prog) { struct ins *tmp; int *labels = malloc(sizeof(int) * prog->len); memset(labels, -1, sizeof(int) * prog->len); fill_label_table(labels, prog); for (tmp = prog->text; tmp < prog->text + prog->len; ++tmp) { int offset = tmp - prog->text; if (labels[offset] != -1) printf("L%d:", labels[offset]); print_ins(tmp, labels, prog); } } static struct threadlist *threadlist_init(struct regex_prog *prog) { struct threadlist *tmp = malloc(sizeof(struct threadlist)); tmp->anchor = &tmp->anchor_body; tmp->anchor->prev = tmp->anchor; tmp->anchor->next = tmp->anchor; tmp->presence_table = malloc(sizeof(struct node *) * prog->len); tmp->ref = prog->text; memset(tmp->presence_table, 0, sizeof(struct node *) * prog->len); return tmp; } static struct node *alloc_node() { struct node *tmp = malloc(sizeof(struct node)); tmp->next = tmp->prev = NULL; return tmp; } static void node_extract(struct node *n) { n->prev->next = n->next; n->next->prev = n->prev; } static void node_addfront(struct threadlist *tl, struct node *n) { n->next = tl->anchor->next; n->prev = tl->anchor; tl->anchor->next->prev = n; tl->anchor->next = n; } static int is_threadlist_empty(struct threadlist *tl) { return tl->anchor->next == tl->anchor; } int adds = 0; int adjustments = 0; static struct node **adjust_if_present(struct threadlist *tl, struct thread *t, int back) { struct node **cell = tl->presence_table + (t->pc - tl->ref); if (*cell == NULL) return cell; if (back) return NULL; node_extract(*cell); node_addfront(tl, *cell); return NULL; } static void threadlist_addfront(struct threadlist *tl, struct thread t) { struct node *tmp; struct node **cell; if ((cell = adjust_if_present(tl, &t, 0)) == NULL) return; ++adds; tmp = alloc_node(); tmp->t = t; node_addfront(tl, tmp); *cell = tmp; } static void threadlist_addback(struct threadlist *tl, struct thread t) { struct node *tmp; struct node **cell; if ((cell = adjust_if_present(tl, &t, 1)) == NULL) return; ++adds; tmp = alloc_node(); tmp->t = t; tmp->prev = tl->anchor->prev; tmp->next = tl->anchor; tl->anchor->prev->next = tmp; tl->anchor->prev = tmp; *cell = tmp; } static int threadlist_getfront(struct threadlist *tl, struct thread *t) { struct node *tmp; if (is_threadlist_empty(tl)) return 0; tmp = tl->anchor->next; *t = tmp->t; node_extract(tmp); tl->presence_table[t->pc - tl->ref] = NULL; return 1; } static struct submatches *alloc_submatches(char **p) { int i; struct submatches *tmp = malloc(sizeof(*tmp)); tmp->refcount = 1; if (p == NULL) { for (i = 0; i < 20; ++i) tmp->saves[i] = NULL; } else { for (i = 0; i < 20; ++i) tmp->saves[i] = p[i]; } return tmp; } static struct thread thread(struct ins *pc) { return (struct thread) { pc, alloc_submatches(NULL) }; } static struct thread thread_copy(struct thread *t) { ++t->submatches->refcount; return *t; } static void save_offset(struct thread *t, int n, char *offset) { if (t->submatches->refcount > 1) { --t->submatches->refcount; t->submatches = alloc_submatches(t->submatches->saves); } t->submatches->saves[n] = offset; } static char **exec_threadlist(struct threadlist *clist, struct threadlist *nlist, char c, char *offset) { struct thread tmp; struct thread tmp2; while (threadlist_getfront(clist, &tmp)) { start: switch(tmp.pc->opcode) { case op_match: if (c == 0) return tmp.submatches->saves; break; case op_char: if (tmp.pc->c == c) { ++tmp.pc; threadlist_addback(nlist, tmp); } break; case op_any: if (c != 0) { ++tmp.pc; threadlist_addback(nlist, tmp); } break; case op_range: if (tmp.pc->table[c / 64] & (1ull << (c % 64))) { ++tmp.pc; threadlist_addback(nlist, tmp); } break; case op_jmp: tmp.pc = tmp.pc->arg_labels[0]; goto start; case op_split: tmp2 = thread_copy(&tmp); tmp2.pc = tmp.pc->arg_labels[1]; tmp.pc = tmp.pc->arg_labels[0]; threadlist_addfront(clist, tmp2); threadlist_addfront(clist, tmp); break; case op_save: save_offset(&tmp, tmp.pc->n, offset); ++tmp.pc; goto start; } } return NULL; } static char **__regex_exec(struct regex_prog *prog, struct str *str) { struct threadlist *clist = threadlist_init(prog); struct threadlist *nlist = threadlist_init(prog); struct threadlist *tmpl; char *tmps = str->start; threadlist_addback(clist, thread(prog->text)); for (; !is_threadlist_empty(clist); ++tmps) { char c = tmps < str->end ? *tmps: 0; char **ret = exec_threadlist(clist, nlist, c, tmps); if (ret != NULL) return ret; if (tmps == str->end) return NULL; tmpl = nlist; nlist = clist; clist = tmpl; } return NULL; } static char **regex_exec(struct regex_prog *prog, struct str *str) { char **ret; heap_offset_save(); ret = __regex_exec(prog, str); heap_offset_restore(); return ret; } static void regex_compile_wrapper(struct regex_prog *prog, struct str *regex) { struct regex *r = regex_parse(*regex); /*regex_print(r);*/ regex_compile(prog, r); /*regex_prog_print(prog);*/ } static void print_saved(char **saves) { int i; if (saves == NULL) { printf("No match\n"); return; } for (i = 0; i < 10; ++i) { struct str tmp = { saves[i * 2], saves[i * 2 + 1] }; printf("%d: %S\n", i, &tmp); } } static void read_part(struct str *part, struct str *s, char *err) { char *tmp = s->start; for (; tmp != s->end && *tmp != '/'; ++tmp) ; if (tmp == s->end) error("%s", err); part->start = s->start; part->end = tmp; s->start = tmp + 1; } static int is_space(char c) { return c == ' ' || c == '\t' || c == '\n'; } static void skip_spaces(struct str *s) { for (; s->start != s->end && is_space(*s->start); ++s->start) ; } static int parse_command(struct str *s) { struct str cmdname; struct compiled_command *ccmd = command_buf + command_num; if (command_num == max_commands) error("too many commands"); skip_spaces(s); if (s->end - s->start == 0) return 0; if (s->start[0] == '/') { ++s->start; read_part(&ccmd->str_condition, s, "command: unfinished condition"); regex_compile_wrapper(&ccmd->condition, &ccmd->str_condition); } else { ccmd->str_condition.start = NULL; ccmd->str_condition.end = NULL; } read_part(&cmdname, s, "command: unfinished command type"); read_part(&ccmd->str_body, s, "command: unfinished body"); regex_compile_wrapper(&ccmd->body, &ccmd->str_body); read_part(&ccmd->str_result, s, "command: unfinished result"); if (streq(&cmdname, &CSTR("s")) == 0) error("%s: command not supported"); ++command_num; return 1; } static void parse_file(char *fname) { char *addr; struct str contents; struct stat st; int fd = open(fname, O_RDONLY, 0); if (fd == -1) goto syserror; if (fstat(fd, &st) == -1) goto syserror; addr = mmap(NULL, st.size, PROT_READ, MAP_PRIVATE, fd, 0); if (addr == (void*)-1) goto syserror; if (close(fd) == -1) goto syserror; contents = (struct str) { addr, addr + st.size }; while (parse_command(&contents)) ; return; syserror: syscall_error(fname); } static void parse_cmdargs(int argc, char **argv) { ++argv; if (*argv == NULL) { printf("No help for you\n"); hexit(1); } else if (strzeq(*argv, "-f")) { if (argv[1] == NULL) error("-f: missing file"); parse_file(argv[1]); argv += 2; if (*argv != NULL) goto unexpected_arg; } else { parse_command(&STR(*argv)); ++argv; if (*argv != NULL) goto unexpected_arg; } return; unexpected_arg: error("%s: unexpected argument", *argv); } static void construct_result(struct str *saves, struct str *result) { char *tmp; printf("%S", saves + 8); for (tmp = result->start; tmp != result->end; ++tmp) { if (*tmp != '\\') { printf("%c", *tmp); } else { if (tmp + 1 == result->end) error("stray '\\' in input"); ++tmp; switch(*tmp) { case '0' ... '7': printf("%S", saves + (*tmp - '0')); break; default: printf("%c", *tmp); } } } printf("%S\n", saves + 9); } static void process_line(struct str *line) { int i; for (i = 0; i < command_num; ++i) { struct compiled_command *cmd = command_buf + i; char **res; if (cmd->str_condition.start != cmd->str_condition.end) { res = regex_exec(&cmd->condition, line); if (res == NULL) continue; } res = regex_exec(&cmd->body, line); if (res == NULL) continue; /* printf("Found: %S|%S\n", &cmd->str_condition, &cmd->str_body); */ construct_result((struct str*)res, &cmd->str_result); return; } printf("%S\n", line); } static void process() { struct str line; while (readline(&line)) process_line(&line); } int main(int argc, char **argv, char **envp) { parse_cmdargs(argc, argv); inbuf_init(); process(); return 0; }