From 3972268b97b789a8212ea9bc7d1b9ce9905a68d8 Mon Sep 17 00:00:00 2001 From: Vladimir Azarov Date: Sun, 13 Oct 2024 20:54:32 +0200 Subject: Base library for tools and sed utility --- .gitignore | 1 + COPYRIGHT | 193 ++++++++ Makefile | 39 +- arch/x86_64/bits/syscall.h.in | 1 - tools/common.c | 427 ++++++++++++++++ tools/common.h | 108 ++++ tools/sed.c | 1089 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1849 insertions(+), 9 deletions(-) create mode 100644 COPYRIGHT create mode 100644 tools/common.c create mode 100644 tools/common.h create mode 100644 tools/sed.c diff --git a/.gitignore b/.gitignore index 0a7aba4..8bfc8ef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.a generated +tools/sed diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 0000000..c1628e9 --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,193 @@ +musl as a whole is licensed under the following standard MIT license: + +---------------------------------------------------------------------- +Copyright © 2005-2020 Rich Felker, et al. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +---------------------------------------------------------------------- + +Authors/contributors include: + +A. Wilcox +Ada Worcester +Alex Dowad +Alex Suykov +Alexander Monakov +Andre McCurdy +Andrew Kelley +Anthony G. Basile +Aric Belsito +Arvid Picciani +Bartosz Brachaczek +Benjamin Peterson +Bobby Bingham +Boris Brezillon +Brent Cook +Chris Spiegel +Clément Vasseur +Daniel Micay +Daniel Sabogal +Daurnimator +David Carlier +David Edelsohn +Denys Vlasenko +Dmitry Ivanov +Dmitry V. Levin +Drew DeVault +Emil Renner Berthing +Fangrui Song +Felix Fietkau +Felix Janda +Gianluca Anzolin +Hauke Mehrtens +He X +Hiltjo Posthuma +Isaac Dunham +Jaydeep Patil +Jens Gustedt +Jeremy Huntwork +Jo-Philipp Wich +Joakim Sindholt +John Spencer +Julien Ramseier +Justin Cormack +Kaarle Ritvanen +Khem Raj +Kylie McClain +Leah Neukirchen +Luca Barbato +Luka Perkov +M Farkas-Dyck (Strake) +Mahesh Bodapati +Markus Wichmann +Masanori Ogino +Michael Clark +Michael Forney +Mikhail Kremnyov +Natanael Copa +Nicholas J. Kain +orc +Pascal Cuoq +Patrick Oppenlander +Petr Hosek +Petr Skocik +Pierre Carrier +Reini Urban +Rich Felker +Richard Pennington +Ryan Fairfax +Samuel Holland +Segev Finer +Shiz +sin +Solar Designer +Stefan Kristiansson +Stefan O'Rear +Szabolcs Nagy +Timo Teräs +Trutz Behn +Valentin Ochs +Will Dietz +William Haddon +William Pitcock + +Portions of this software are derived from third-party works licensed +under terms compatible with the above MIT license: + +The TRE regular expression implementation (src/regex/reg* and +src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed +under a 2-clause BSD license (license text in the source files). The +included version has been heavily modified by Rich Felker in 2012, in +the interests of size, simplicity, and namespace cleanliness. + +Much of the math library code (src/math/* and src/complex/*) is +Copyright © 1993,2004 Sun Microsystems or +Copyright © 2003-2011 David Schultz or +Copyright © 2003-2009 Steven G. Kargl or +Copyright © 2003-2009 Bruce D. Evans or +Copyright © 2008 Stephen L. Moshier or +Copyright © 2017-2018 Arm Limited +and labelled as such in comments in the individual source files. All +have been licensed under extremely permissive terms. + +The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008 +The Android Open Source Project and is licensed under a two-clause BSD +license. It was taken from Bionic libc, used on Android. + +The AArch64 memcpy and memset code (src/string/aarch64/*) are +Copyright © 1999-2019, Arm Limited. + +The implementation of DES for crypt (src/crypt/crypt_des.c) is +Copyright © 1994 David Burren. It is licensed under a BSD license. + +The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was +originally written by Solar Designer and placed into the public +domain. The code also comes with a fallback permissive license for use +in jurisdictions that may not recognize the public domain. + +The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 +Valentin Ochs and is licensed under an MIT-style license. + +The x86_64 port was written by Nicholas J. Kain and is licensed under +the standard MIT terms. + +The mips and microblaze ports were originally written by Richard +Pennington for use in the ellcc project. The original code was adapted +by Rich Felker for build system and code conventions during upstream +integration. It is licensed under the standard MIT terms. + +The mips64 port was contributed by Imagination Technologies and is +licensed under the standard MIT terms. + +The powerpc port was also originally written by Richard Pennington, +and later supplemented and integrated by John Spencer. It is licensed +under the standard MIT terms. + +All other files which have no copyright comments are original works +produced specifically for use as part of this library, written either +by Rich Felker, the main author of the library, or by one or more +contibutors listed above. Details on authorship of individual files +can be found in the git version control history of the project. The +omission of copyright and license comments in each file is in the +interest of source tree size. + +In addition, permission is hereby granted for all public header files +(include/* and arch/*/bits/*) and crt files intended to be linked into +applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit +the copyright notice and permission notice otherwise required by the +license, and to use these files without any requirement of +attribution. These files include substantial contributions from: + +Bobby Bingham +John Spencer +Nicholas J. Kain +Rich Felker +Richard Pennington +Stefan Kristiansson +Szabolcs Nagy + +all of whom have explicitly granted such permission. + +This file previously contained text expressing a belief that most of +the files covered by the above exception were sufficiently trivial not +to be subject to copyright, resulting in confusion over whether it +negated the permissions granted in the license. In the spirit of +permissive licensing, and of not having licensing issues being an +obstacle to adoption, that text has been removed. diff --git a/Makefile b/Makefile index aeb1325..113c375 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ AR = ar CC = gcc -CFLAGS = -Wall -g -O3 -march=native +CFLAGS = -O0 -g -Wall arch = x86_64 @@ -41,20 +41,42 @@ all_inc = $(sort $(inc) $(generated_headers:generated/%=%) \ $(arch_inc:arch/$(arch)/%=include/%) \ $(generic_inc:arch/generic/%=include/%)) +dirs = $(sort $(dir $(generated_headers))) + +### Tools ### + +gcc_prefix = $(dir $(shell gcc -print-libgcc-file-name)) + +tool_cflags = -Wall -nostdinc -Wno-main \ + -ffreestanding -fno-pic -fno-stack-protector \ + -Igenerated/include -Iarch/$(arch) \ + -I /usr/lib/clang/17/include \ + -Iarch/generic $(CFLAGS) + +.PRECIOUS: $(addprefix tools/, common.o sed.o) + +tools/%.o: tools/%.c tools/common.h + $(CC) $(tool_cflags) -c -o $@ $< + +tools/%: tools/%.o tools/common.o + ld -L$(gcc_prefix) -nostdlib -o $@ $^ -lgcc + +### Musl ### + default: $(lib_obj) -generated_dirs: - mkdir -p $(dir $(generated_headers)) +$(dirs): + mkdir -p $@ -$(generated_headers): | generated_dirs +$(generated_headers): | $(dirs) generated/include/bits/alltypes.h: arch/$(arch)/bits/alltypes.h.in \ - include/alltypes.h.in - sed -f tools/mkalltypes.sed $^ >$@ + include/alltypes.h.in tools/sed + cat $(filter %.in, $^) | tools/sed -f tools/mkalltypes.sed >$@ -generated/include/bits/syscall.h: arch/$(arch)/bits/syscall.h.in +generated/include/bits/syscall.h: arch/$(arch)/bits/syscall.h.in tools/sed cp $< $@ - sed -n -e 's/__NR_/SYS_/p' < $< >>$@ + tools/sed 's/__NR_/SYS_/' < $< >>$@ src/internal/version.o: cflags += -DVERSION=\"$(shell cat VERSION)\" crt/crt1.o: arch/$(arch)/crt_arch.h @@ -105,3 +127,4 @@ install: install-libs install-headers clean: rm -rf generated rm -f crt/*.o src/*/*.o libc.a crt1.o crti.o crtn.o + rm -f tools/*.o tools/sed diff --git a/arch/x86_64/bits/syscall.h.in b/arch/x86_64/bits/syscall.h.in index 6543bbb..cdacbc5 100644 --- a/arch/x86_64/bits/syscall.h.in +++ b/arch/x86_64/bits/syscall.h.in @@ -361,4 +361,3 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 - diff --git a/tools/common.c b/tools/common.c new file mode 100644 index 0000000..00f7d3c --- /dev/null +++ b/tools/common.c @@ -0,0 +1,427 @@ +#define START "_start" +#include +#include +#include + +#include + +#include "common.h" + +enum { + flush_on_newline = 1 +}; + +int errno = 0; +char *progname; + +static long sysret(unsigned long r) +{ + if (r > -4096UL) { + errno = -r; + return -1; + } + return r; +} + +void exit(int code) +{ + __syscall1(__NR_exit, code); +} + +static void *brk(void *addr) +{ + return (void *)__syscall1(__NR_brk, (long)addr); +} + +int write(int fd, char *buf, int len) +{ + return sysret(__syscall3(__NR_write, fd, (long)buf, len)); +} + +int read(int fd, char *buf, int len) +{ + return sysret(__syscall3(__NR_read, fd, (long)buf, len)); +} + +int open(char *fname, int flags, int mode) +{ + int o_largefile = 0100000; + return sysret(__syscall3(__NR_open, (long)fname, + flags|o_largefile, mode)); +} + +int stat(char *fname, struct stat *st) +{ + return sysret(__syscall2(__NR_stat, (long)fname, (long)st)); +} + +int fstat(int fd, struct stat *st) +{ + return sysret(__syscall2(__NR_fstat, (long)fd, (long)st)); +} + +int close(int fd) +{ + return sysret(__syscall1(__NR_close, fd)); +} + +void *mmap(char *addr, unsigned long length, int prot, int flags, int fd, + unsigned long offset) +{ + return (void*)sysret(__syscall6(__NR_mmap, (long)addr, length, + prot, flags, fd, offset)); +} + +int munmap(char *addr, int length) +{ + return sysret(__syscall2(__NR_munmap, (long)addr, length)); +} + +#define MREMAP_MAYMOVE 1 + +int mremap(void *old_addr, unsigned long old_size, unsigned long new_size, + int flags, void *new_addr) +{ + return sysret(__syscall5(__NR_mremap, (long)old_addr, old_size, + new_size, flags, (long)new_addr)); +} + +int strzlen(char *s) +{ + char *sc = s; + for (; *s; ++s) + ; + return s - sc; +} + +int strzeq(char *s1, char *s2) +{ + for (; *s1 && *s1 == *s2; ++s1, ++s2) + ; + return *s1 == 0 && *s2 == 0; +} + +int streq(struct str *s1, struct str *s2) +{ + int i; + if (s1->end - s1->start != s2->end - s2->start) + return 0; + for (i = 0; s1->start + i < s1->end; ++i) { + if (s1->start[i] != s2->start[i]) + return 0; + } + return 1; +} + +void memmove(char *dest, char *src, int len) +{ + int i; + if (dest < src) { + for (i = 0; i < len; ++i) + dest[i] = src[i]; + } else { + for (i = len - 1; i >= 0; --i) + dest[i] = src[i]; + } +} + +void memcpy(void *dest, void *src, int len) +{ + int i; + char *d = dest; + char *s = src; + for (i = 0; i < len; ++i) + d[i] = s[i]; +} + +void memset(void *dest, char c, int len) +{ + char *tmp = dest; + for (; tmp < (char*)dest + len; ++tmp) + *tmp = c; +} + +#define E(e, s) [e] = s, + +char *estrings[256] = { +#include "../src/errno/__strerror.h" +}; +#undef E + +void perror(char *s) +{ + int idx = errno; + if (errno > 256 || estrings[errno] == NULL) + idx = 0; + eprintf("%s: %s\n", s, estrings[idx]); +} + +void syscall_error(char *fname) +{ + perror(fname); + hexit(1); +} + +enum { + buflen = 32 * 1024 +}; + +struct iobuf { + int fd; + char buf[buflen]; + int len; +}; + +struct iobuf stdout = { 1 }; +struct iobuf stderr = { 2 }; + +static void flush(struct iobuf *buf) +{ + int ret = write(buf->fd, buf->buf, buf->len); + if (ret != buf->len) { + write(2, _S("some stdout/stderr error\n")); + exit(3); + } + buf->len = 0; +} + +static void putchar(struct iobuf *buf, char c) +{ + if (buf->len == buflen) + flush(buf); + buf->buf[buf->len] = c; + ++buf->len; + + if (flush_on_newline && c == '\n') + flush(buf); +} + +void hexit(int code) +{ + flush(&stdout); + flush(&stderr); + exit(code); +} + +static void print_int(struct iobuf *buf, int nsigned, int byte4, + unsigned long n) +{ + char tmpbuf[64]; + char *p = tmpbuf; + unsigned long k; + + if (nsigned) { + if (byte4) { + int tmp = n; + if (tmp < 0) { + putchar(buf, '-'); + tmp = -tmp; + } + k = tmp; + } else { + if ((signed long)n < 0) { + putchar(buf, '-'); + n = -n; + } + k = n; + } + } else { + k = n; + } + + if (k == 0) { + putchar(buf, '0'); + return; + } + + for (; k > 0; k /= 10) { + *p = k % 10 + '0'; + ++p; + } + for (--p;; --p) { + putchar(buf, *p); + if (p == tmpbuf) + break; + } +} + +void print_strz(struct iobuf *buf, char *s) +{ + for (; *s; ++s) + putchar(buf, *s); +} + +void print_str(struct iobuf *buf, struct str *s) +{ + char *tmp = s->start; + for (; tmp != s->end; ++tmp) + putchar(buf, *tmp); +} + +static void format(struct iobuf *buf, char c, va_list vl) +{ + switch(c) { + unsigned long arg; + case '%': + putchar(buf, '%'); + break; + case 'd': + case 'u': + arg = va_arg(vl, unsigned int); + print_int(buf, c == 'd', 1, arg); + break; + case 'D': + case 'U': + arg = va_arg(vl, unsigned long); + print_int(buf, c == 'D', 0, arg); + break; + case 's': + print_strz(buf, va_arg(vl, char*)); + break; + case 'S': + print_str(buf, va_arg(vl, struct str *)); + break; + case 'c': + arg = va_arg(vl, int); + putchar(buf, arg); + break; + default: + write(2, _S("printf: unexpected format option \n")); + hexit(2); + } +} + +static void __printf(struct iobuf *buf, char *s, va_list vl) +{ + for (; *s; ++s) { + if (*s == '%') { + if (s[1] == 0) { + write(2, _S("printf: nothing comes after '%'")); + hexit(2); + } + format(buf, s[1], vl); + ++s; + } else { + putchar(buf, *s); + } + } + return; +} + +void printf(char *fmt, ...) +{ + va_list vl; + va_start(vl, fmt); + __printf(&stdout, fmt, vl); + va_end(vl); +} + +void evprintf(char *fmt, va_list vl) +{ + print_strz(&stderr, progname); + print_strz(&stderr, ": "); + __printf(&stderr, fmt, vl); +} + +void eprintf(char *fmt, ...) +{ + va_list vl; + va_start(vl, fmt); + evprintf(fmt, vl); + va_end(vl); +} + +/* Memory management */ + +struct mem_buffer { + char *cur, *end; + char *saved; +}; + +static struct mem_buffer mem_buf; + +static void mem_buffer_init() +{ + mem_buf.cur = mem_buf.end = brk(NULL); + mem_buf.saved = NULL; +} + +void save_heap_offset() +{ + if (mem_buf.saved != NULL) { + eprintf("mem_buffer: offset already saved\n"); + hexit(2); + } + mem_buf.saved = mem_buf.cur; +} + +void restore_heap_offset() +{ + if (mem_buf.saved == NULL) { + eprintf("mem_buffer: no offset was saved\n"); + hexit(2); + } + mem_buf.cur = mem_buf.saved; + mem_buf.saved = NULL; +} + +static int get_alignment(int n) +{ + if (n >= 16) + return 16; + if (n >= 8) + return 8; + if (n >= 4) + return 4; + if (n >= 2) + return 2; + return 1; +} + +static void mem_buf_grow(int size) +{ + char *tmp; + if (size < 128*1024) { + size = 128*1024; + } else { + size *= 2; + size &= -4096; + } + /*printf("Growing (%d)\n", size);*/ + tmp = brk(mem_buf.end + size); + if (tmp == mem_buf.end) { + eprintf("mem_buf: unable to allocate memory\n"); + hexit(1); + } + mem_buf.end = tmp; +} + +#define ALIGN(v, align) \ + (void*)(((unsigned long)(v) + (align) - 1ul) & -(align)) + +void *malloc(int size) +{ + int align = get_alignment(size); + char *tmp = ALIGN(mem_buf.cur, align); + int diff = tmp + size - mem_buf.end; + + if (diff > 0) + mem_buf_grow(diff); + + mem_buf.cur = tmp + size; + return tmp; +} + +int main(int argc, char **argv, char **envp); + +void _start_c(unsigned long *p) +{ + int argc = p[0]; + char **argv = (void *)(p + 1); + char **envp = argv + argc + 1; + + progname = argv[0]; + mem_buffer_init(); + + hexit(main(argc, argv, envp)); +} diff --git a/tools/common.h b/tools/common.h new file mode 100644 index 0000000..43e2dda --- /dev/null +++ b/tools/common.h @@ -0,0 +1,108 @@ +#ifndef COMMON_H +#define COMMON_H + +#include + +#include + +#define NULL ((void *)0) +#define _S(s) s, (sizeof(s) - 1) +#define CSTR(s) ((struct str) { s, s + sizeof(s) - 1 }) +#define STR(s) ((struct str) { s, s + strzlen(s) }) + +struct str { + char *start; + char *end; +}; + +extern int errno; +extern char *progname; + +void exit(int code); +int read(int fd, char *buf, int len); +int write(int fd, char *buf, int len); + +#define O_RDONLY 0 + +int open(char *fname, int flags, int mode); + +/* + * Taken from linux/arch/x86/include/uapi/asm/stat.h + * Matches arch/x86_64/bits/stat.h + */ + +struct time { + unsigned long sec; + unsigned long usec; +}; + +struct stat { + unsigned long dev; + unsigned long ino; + unsigned long nlink; + + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int __pad0; + + unsigned long rdev; + unsigned long size; + unsigned long blksize; + unsigned long blocks; + + struct time atime; + struct time mtime; + struct time ctime; + long __unused[3]; +}; + +int stat(char *fname, struct stat *st); +int fstat(int fd, struct stat *st); + +int close(int fd); + +#define MAP_PRIVATE 0x02 +#define MAP_ANON 0x20 + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +void *mmap(char *addr, unsigned long length, int prot, int flags, int fd, + unsigned long offset); +int munmap(char *addr, int length); + +#define MREMAP_MAYMOVE 1 + +int mremap(void *old_addr, unsigned long old_size, unsigned long new_size, + int flags, void *new_addr); + +void hexit(int code); + +extern struct iobuf stdout; +extern struct iobuf stderr; + +void print_strz(struct iobuf *buf, char *s); +void print_str(struct iobuf *buf, struct str *s); + +void printf(char *fmt, ...); +void eprintf(char *fmt, ...); +void evprintf(char *fmt, va_list vl); + +void save_heap_offset(); +void restore_heap_offset(); +void *malloc(int size); + +int strzlen(char *s); +int strzeq(char *s1, char *s2); + +int streq(struct str *s1, struct str *s2); +void perror(char *s); +void syscall_error(char *fname); + +void memmove(char *dest, char *src, int len); +void memcpy(void *dest, void *src, int len); +void memset(void *dest, char c, int len); +#endif diff --git a/tools/sed.c b/tools/sed.c new file mode 100644 index 0000000..fd60a07 --- /dev/null +++ b/tools/sed.c @@ -0,0 +1,1089 @@ +#include "common.h" + +enum { + max_commands = 16, + inputbuf_size = 32 * 1024, + regbuf_size = 40, + text_size = 1024, +}; + +struct command { + struct str condition; + struct str body; + struct str result; +}; + +enum regex_type { + rx_char, + rx_any, + rx_range, + rx_rep, + rx_rep_nongreedy, + rx_concat, +}; + +struct regex { + enum regex_type type; + int submatch_id; + union { + char c; + unsigned long long table[2]; + struct regex *child; + struct { + struct regex **children; + int childnum; + }; + }; +}; + +struct regbuf { + struct regex *buf[regbuf_size]; + int size; +}; + +struct parsing_state { + int candidate_id; + char ids[10]; +}; + +enum opcode { + op_char, /* char c */ + op_match, /* match */ + op_any, /* any */ + op_range, /* range [...] */ + op_jmp, /* jmp L1 */ + op_split, /* split L1, L2 */ + op_save, /* save n */ +}; + +struct ins { + enum opcode opcode; + union { + char c; + unsigned long long table[2]; + int n; + struct ins *arg_labels[2]; + }; +}; + +struct regex_prog { + struct ins *text; + int len; + int size; +}; + +struct submatches { + char *saves[20]; + int refcount; +}; + +struct thread { + struct ins *pc; + struct submatches *submatches; +}; + +struct node { + struct node *next; + struct node *prev; + + struct thread t; +}; + +struct threadlist { + struct node *anchor; + + struct node anchor_body; + + struct ins *ref; + struct node **presence_table; + +}; + +struct input_buf { + char buf[inputbuf_size]; + char *start; + char *end; +} input_buf; + +struct compiled_command { + struct str str_condition; + struct regex_prog condition; + + struct str str_body; + struct regex_prog body; + struct str str_result; +}; + +static struct compiled_command command_buf[max_commands]; +static int command_num = 0; + +static void error(char *fmt, ...) +{ + va_list vl; + va_start(vl, fmt); + evprintf(fmt, vl); + va_end(vl); + print_strz(&stderr, "\n"); + hexit(1); +} + +static void inbuf_init() +{ + input_buf.start = input_buf.end = input_buf.buf; +} + +static int readline(struct str *line) +{ + char *start = input_buf.start; + char *tmp; + int none_processed = 1; + + for (;;) { + for (tmp = start; tmp != input_buf.end; ++tmp) { + none_processed = 0; + if (*tmp == '\n') { + line->start = input_buf.start; + line->end = tmp; + input_buf.start = tmp + 1; + return 1; + } + } + if (input_buf.end < input_buf.buf + inputbuf_size / 2) { + int rem, ret; +read: + rem = input_buf.buf + inputbuf_size - input_buf.end; + ret = read(0, input_buf.end, rem); + if (ret == -1) + syscall_error("stdin"); + if (ret == 0) { + if (none_processed) + return 0; + error("unexpected input line end"); + } + start = input_buf.end; + input_buf.end += ret; + } else { + int size; + if (input_buf.start == input_buf.buf) + error("input line is too long"); + size = input_buf.end - input_buf.start; + memmove(input_buf.buf, input_buf.start, size); + input_buf.start = input_buf.buf; + input_buf.end = input_buf.buf + size; + goto read; + } + } +} + +static struct regex *regex_alloc(enum regex_type type) +{ + struct regex *tmp = malloc(sizeof(struct regex)); + tmp->type = type; + tmp->submatch_id = -1; + return tmp; +} + +static void regbuf_add(struct regbuf *buf, struct regex *r) +{ + if (buf->size == regbuf_size) + error("regbuf: no more space"); + buf->buf[buf->size] = r; + ++buf->size; +} + +static struct regex *regbuf_pop(struct regbuf *buf) +{ + if (buf->size == 0) + return NULL; + --buf->size; + return buf->buf[buf->size]; +} + +static char *find_end(struct str *s) +{ + for (; s->start < s->end; ++s->start) { + if (*s->start == ']') { + char *tmp = s->start; + s->start = tmp + 1; + return tmp; + } + } + error("regex_parse: missing ']'"); + return NULL; +} + +static void __parse_brackets(struct regex *r, struct str *s) +{ + unsigned long long constant; + char *tmp; + if (s->start == s->end) + goto error; + if (*s->start == '^') { + constant = -1ull; + ++s->start; + if (s->start == s->end) + goto error; + } else { + constant = 0; + } + r->table[0] = r->table[1] = constant; + for (tmp = s->start; tmp != s->end; ++tmp) { + /* flip bit */ + r->table[*tmp / 64] ^= (1ull << (*tmp % 64)); + } + return; +error: + error("regex_parse: met [] or [^]"); +} + +static void parse_brackets(struct regbuf *buf, struct str *rx) +{ + struct str brackets; + struct regex *r; + + ++rx->start; + brackets = (struct str) { rx->start, find_end(rx) }; + r = regex_alloc(rx_range); + + __parse_brackets(r, &brackets); + regbuf_add(buf, r); +} + +static void parse_rep(struct regbuf *buf, struct str *rx) +{ + enum regex_type type; + struct regex *r, *tmp; + if (rx->start + 1 != rx->end && rx->start[1] == '?') { + type = rx_rep_nongreedy; + rx->start += 2; + } else { + type = rx_rep; + ++rx->start; + } + r = regex_alloc(type); + tmp = regbuf_pop(buf); + if (tmp == NULL) + error("regex_parse: stray '*'"); + r->child = tmp; + regbuf_add(buf, r); +} + +static char *find_pend(struct str *rx) +{ + int level = 0; + for (; rx->start + 1 < rx->end; ++rx->start) { + if (*rx->start == '\\' && rx->start[1] == '(') + ++level; + if (*rx->start == '\\' && rx->start[1] == ')') { + if (level == 0) { + char *tmp = rx->start; + rx->start += 2; + return tmp; + } else { + --level; + } + } + } + error("regex_parse: missing \\)"); + return NULL; +} + +static int pick_id(struct parsing_state *state) +{ + int i; + if (state->candidate_id != -1) { + int tmp = state->candidate_id; + state->ids[tmp] = 0; + state->candidate_id = -1; + return tmp; + } + /* 0, 8 and 9 are for internal use, so they must be requested + * explicitly" + */ + for (i = 1; i < 8; ++i) { + if (state->ids[i] == 1) { + state->ids[i] = 0; + return i; + } + } + error("regex_parse: space of submatch ids is exhausted"); + return -1; +} + +static struct regex *__regex_parse(struct parsing_state *state, + struct str *rx); + +static void parse_parenthesis(struct parsing_state *state, + struct regbuf *buf, struct str *rx) +{ + struct str parenthesis; + struct regex *r; + int id; + + ++rx->start; + id = pick_id(state); + + parenthesis = (struct str) { rx->start, find_pend(rx) }; + r = __regex_parse(state, &parenthesis); + + r->submatch_id = id; + regbuf_add(buf, r); +} + +static void parse_backslash(struct parsing_state *state, + struct regbuf *buf, struct str *rx) +{ + struct regex *r; + if (rx->start + 1 == rx->end) + error("regex_parse: stray '\\'"); + ++rx->start; + + switch(*rx->start) { + case '(': + parse_parenthesis(state, buf, rx); + break; + case '0' ... '9': + state->candidate_id = *rx->start - '0'; + ++rx->start; + break; + default: + r = regex_alloc(rx_char); + r->c = *rx->start; + regbuf_add(buf, r); + ++rx->start; + } +} + +static struct regex *collect(struct regex **buf, int len) +{ + if (len == 0) + error("regex_parse: empty regex"); + if (len == 1) { + return *buf; + } else { + int size; + struct regex *r = regex_alloc(rx_concat); + r->childnum = len; + size = sizeof(struct regex *) * len; + r->children = malloc(size); + memcpy(r->children, buf, size); + return r; + } +} + +static struct regex *__regex_parse(struct parsing_state *state, + struct str *rx) +{ + struct regbuf rbuf; + struct regex *r; + + rbuf.size = 0; + + while (rx->start != rx->end) { + switch(*rx->start) { + case '.': + r = regex_alloc(rx_any); + break; + case '[': + parse_brackets(&rbuf, rx); + continue; + case '*': + parse_rep(&rbuf, rx); + continue; + case '\\': + parse_backslash(state, &rbuf, rx); + continue; + default: + r = regex_alloc(rx_char); + r->c = *rx->start; + } + regbuf_add(&rbuf, r); + ++rx->start; + } + return collect(rbuf.buf, rbuf.size); +} + +static struct regex *regex_parse(struct str rx) +{ + struct parsing_state state; + struct regex *buf[3]; + int idx = 0, inc = 0; + + memset(state.ids, 1, 10); + state.candidate_id = -1; + + if (rx.start != rx.end && *rx.start != '^') + buf[idx++] = __regex_parse(&state, &CSTR("\\8\\(.*?\\)")); + else + ++rx.start; + + if (rx.start != rx.end && rx.end[-1] != '$') { + buf[idx + 1] = __regex_parse(&state, &CSTR("\\9\\(.*\\)")); + inc = 1; + } else { + --rx.end; + } + + buf[idx] = __regex_parse(&state, &rx); + if (buf[idx]->submatch_id != -1) + error("regex_parse: whole regex receives id=0 implicitly"); + buf[idx]->submatch_id = 0; + ++idx; + + idx += inc; + return collect(buf, idx); +} + +static char *print_range(unsigned long long *table) +{ + static char buf[68]; + int idx = 0; + int i; + unsigned long long tcopy[2] = { table[0], table[1] }; + + buf[idx++] = '['; + int raised_bits = __builtin_popcountll(tcopy[0]) + + __builtin_popcountll(tcopy[1]); + if (raised_bits > 64) { + buf[idx++] = '^'; + tcopy[0] = ~tcopy[0]; + tcopy[1] = ~tcopy[1]; + } + for (i = 0; i < 128; ++i) { + if (tcopy[i / 64] & (1ull << (i % 64))) + buf[idx++] = i; + } + buf[idx++] = ']'; + return buf; +} + +static void __regex_print(struct regex *r) +{ + int i; + + if (r->submatch_id != -1) + printf("%d:", r->submatch_id); + switch(r->type) { + case rx_char: + if (r->c == ' ') + printf("' '"); + else + printf("%c", r->c); + return; + case rx_any: + printf("."); + return; + case rx_range: + printf("%s", print_range(r->table)); + return; + case rx_concat: + printf("(| "); + for (i = 0; i < r->childnum; ++i) { + __regex_print(r->children[i]); + if (i + 1 < r->childnum) + printf(" "); + else + printf(")"); + } + return; + case rx_rep: + case rx_rep_nongreedy: + printf("(*"); + if (r->type == rx_rep_nongreedy) + printf("? "); + else + printf(" "); + __regex_print(r->child); + printf(")"); + return; + } +} + +static void regex_print(struct regex *r) +{ + __regex_print(r); + printf("\n"); +} + +static void regex_prog_alloc(struct regex_prog *prog) +{ + prog->size = text_size; + prog->text = malloc(sizeof(struct ins) * prog->size); + prog->len = 0; +} + +static struct ins *prog_get_ins(struct regex_prog *prog, int inc) +{ + struct ins *tmp; + if (prog->len == prog->size) + error("regex_prog: text section is too big"); + tmp = prog->text + prog->len; + if (inc) + ++prog->len; + return tmp; +} + +static void __regex_compile(struct regex_prog *prog, struct regex *r); + +static void compile_rep(struct regex_prog *prog, struct regex *r) +{ + struct ins *tmp, *tmp2; + + tmp = prog_get_ins(prog, 1); + tmp->opcode = op_split; + prog_get_ins(prog, 0); + tmp->arg_labels[0] = tmp + 1; + + __regex_compile(prog, r->child); + tmp2 = prog_get_ins(prog, 1); + tmp2->opcode = op_jmp; + tmp2->arg_labels[0] = tmp; + + tmp2 = prog_get_ins(prog, 0); + tmp->arg_labels[1] = tmp2; + + if (r->type == rx_rep_nongreedy) { + struct ins *tmp3 = tmp->arg_labels[1]; + tmp->arg_labels[1] = tmp->arg_labels[0]; + tmp->arg_labels[0] = tmp3; + } +} + +static void __regex_compile(struct regex_prog *prog, struct regex *r) +{ + struct ins *tmp; + int i; + + if (r->submatch_id != -1) { + tmp = prog_get_ins(prog, 1); + tmp->opcode = op_save; + tmp->n = r->submatch_id * 2; + } + switch(r->type) { + case rx_char: + tmp = prog_get_ins(prog, 1); + tmp->opcode = op_char; + tmp->c = r->c; + break; + case rx_any: + tmp = prog_get_ins(prog, 1); + tmp->opcode = op_any; + break; + case rx_range: + tmp = prog_get_ins(prog, 1); + tmp->opcode = op_range; + tmp->table[0] = r->table[0]; + tmp->table[1] = r->table[1]; + break; + case rx_concat: + for (i = 0; i < r->childnum; ++i) + __regex_compile(prog, r->children[i]); + break; + case rx_rep: + case rx_rep_nongreedy: + compile_rep(prog, r); + break; + } + if (r->submatch_id != -1) { + tmp = prog_get_ins(prog, 1); + tmp->opcode = op_save; + tmp->n = r->submatch_id * 2 + 1; + } +} + +static void regex_compile(struct regex_prog *prog, struct regex *r) +{ + struct ins *tmp; + regex_prog_alloc(prog); + __regex_compile(prog, r); + tmp = prog_get_ins(prog, 1); + tmp->opcode = op_match; +} + +static void assign_L(int *labels, int offset) +{ + static int next_label = 0; + if (labels[offset] == -1) { + labels[offset] = next_label; + ++next_label; + } +} + +static void fill_label_table(int *labels, struct regex_prog *prog) +{ + struct ins *tmp; + for (tmp = prog->text; tmp < prog->text + prog->len; ++tmp) { + if (tmp->opcode == op_jmp) { + assign_L(labels, tmp->arg_labels[0] - prog->text); + } + if (tmp->opcode == op_split) { + assign_L(labels, tmp->arg_labels[0] - prog->text); + assign_L(labels, tmp->arg_labels[1] - prog->text); + } + } +} + +static void print_ins(struct ins *ins, int *labels, + struct regex_prog *prog) +{ + switch(ins->opcode) { + case op_char: + printf("\tchar "); + if (ins->c == ' ') + printf("' '\n"); + else + printf("%c\n", ins->c); + break; + case op_any: + printf("\tany\n"); + break; + case op_range: + printf("\trange %s\n", print_range(ins->table)); + break; + case op_match: + printf("\tmatch\n"); + break; + case op_split: + printf("\tsplit L%d, L%d\n", + labels[ins->arg_labels[0] - prog->text], + labels[ins->arg_labels[1] - prog->text]); + break; + case op_jmp: + printf("\tjmp L%d\n", + labels[ins->arg_labels[0] - prog->text]); + break; + case op_save: + printf("\tsave %d\n", ins->n); + break; + } +} + +static void regex_prog_print(struct regex_prog *prog) +{ + struct ins *tmp; + int *labels = malloc(sizeof(int) * prog->len); + memset(labels, -1, sizeof(int) * prog->len); + fill_label_table(labels, prog); + + for (tmp = prog->text; tmp < prog->text + prog->len; ++tmp) { + int offset = tmp - prog->text; + if (labels[offset] != -1) + printf("L%d:", labels[offset]); + print_ins(tmp, labels, prog); + } +} + +static struct threadlist *threadlist_init(struct regex_prog *prog) +{ + struct threadlist *tmp = malloc(sizeof(struct threadlist)); + + tmp->anchor = &tmp->anchor_body; + tmp->anchor->prev = tmp->anchor; + tmp->anchor->next = tmp->anchor; + + tmp->presence_table = malloc(sizeof(struct node *) * prog->len); + tmp->ref = prog->text; + memset(tmp->presence_table, 0, sizeof(struct node *) * prog->len); + return tmp; +} + +static struct node *alloc_node() +{ + struct node *tmp = malloc(sizeof(struct node)); + tmp->next = tmp->prev = NULL; + return tmp; +} + +static void node_extract(struct node *n) +{ + n->prev->next = n->next; + n->next->prev = n->prev; +} + +static void node_addfront(struct threadlist *tl, struct node *n) +{ + n->next = tl->anchor->next; + n->prev = tl->anchor; + tl->anchor->next->prev = n; + tl->anchor->next = n; +} + +static int is_threadlist_empty(struct threadlist *tl) +{ + return tl->anchor->next == tl->anchor; +} + +int adds = 0; +int adjustments = 0; + +static struct node **adjust_if_present(struct threadlist *tl, + struct thread *t, int back) +{ + struct node **cell = tl->presence_table + (t->pc - tl->ref); + + if (*cell == NULL) + return cell; + if (back) + return NULL; + node_extract(*cell); + node_addfront(tl, *cell); + return NULL; +} + +static void threadlist_addfront(struct threadlist *tl, struct thread t) +{ + struct node *tmp; + struct node **cell; + if ((cell = adjust_if_present(tl, &t, 0)) == NULL) + return; + + ++adds; + tmp = alloc_node(); + tmp->t = t; + node_addfront(tl, tmp); + *cell = tmp; +} + +static void threadlist_addback(struct threadlist *tl, struct thread t) +{ + struct node *tmp; + struct node **cell; + + if ((cell = adjust_if_present(tl, &t, 1)) == NULL) + return; + + ++adds; + tmp = alloc_node(); + tmp->t = t; + + tmp->prev = tl->anchor->prev; + tmp->next = tl->anchor; + tl->anchor->prev->next = tmp; + tl->anchor->prev = tmp; + *cell = tmp; +} + +static int threadlist_getfront(struct threadlist *tl, struct thread *t) +{ + struct node *tmp; + + if (is_threadlist_empty(tl)) + return 0; + tmp = tl->anchor->next; + *t = tmp->t; + + node_extract(tmp); + + tl->presence_table[t->pc - tl->ref] = NULL; + return 1; +} + +static struct submatches *alloc_submatches(char **p) +{ + int i; + struct submatches *tmp = malloc(sizeof(*tmp)); + tmp->refcount = 1; + if (p == NULL) { + for (i = 0; i < 20; ++i) + tmp->saves[i] = NULL; + } else { + for (i = 0; i < 20; ++i) + tmp->saves[i] = p[i]; + } + return tmp; +} + +static struct thread thread(struct ins *pc) +{ + return (struct thread) { pc, alloc_submatches(NULL) }; +} + +static struct thread thread_copy(struct thread *t) +{ + ++t->submatches->refcount; + return *t; +} + +static void save_offset(struct thread *t, int n, char *offset) +{ + if (t->submatches->refcount > 1) { + --t->submatches->refcount; + t->submatches = alloc_submatches(t->submatches->saves); + } + t->submatches->saves[n] = offset; +} + + +static char **exec_threadlist(struct threadlist *clist, + struct threadlist *nlist, char c, char *offset) +{ + struct thread tmp; + while (threadlist_getfront(clist, &tmp)) { +start: + switch(tmp.pc->opcode) { + case op_match: + if (c == 0) + return tmp.submatches->saves; + break; + case op_char: + if (tmp.pc->c == c) { + ++tmp.pc; + threadlist_addback(nlist, tmp); + } + break; + case op_any: + if (c != 0) { + ++tmp.pc; + threadlist_addback(nlist, tmp); + } + break; + case op_range: + if (tmp.pc->table[c / 64] & (1ull << (c % 64))) { + ++tmp.pc; + threadlist_addback(nlist, tmp); + } + break; + case op_jmp: + tmp.pc = tmp.pc->arg_labels[0]; + goto start; + case op_split: + struct thread tmp2 = thread_copy(&tmp); + tmp2.pc = tmp.pc->arg_labels[1]; + tmp.pc = tmp.pc->arg_labels[0]; + threadlist_addfront(clist, tmp2); + threadlist_addfront(clist, tmp); + break; + case op_save: + save_offset(&tmp, tmp.pc->n, offset); + ++tmp.pc; + goto start; + } + } + return NULL; +} + +static char **__regex_exec(struct regex_prog *prog, struct str *str) +{ + struct threadlist *clist = threadlist_init(prog); + struct threadlist *nlist = threadlist_init(prog); + struct threadlist *tmpl; + char *tmps = str->start; + + threadlist_addback(clist, thread(prog->text)); + + for (; !is_threadlist_empty(clist); ++tmps) { + char c = tmps < str->end ? *tmps: 0; + char **ret = exec_threadlist(clist, nlist, c, tmps); + if (ret != NULL) + return ret; + if (tmps == str->end) + return NULL; + tmpl = nlist; + nlist = clist; + clist = tmpl; + } + return NULL; +} + +static char **regex_exec(struct regex_prog *prog, struct str *str) +{ + char **ret; + save_heap_offset(); + ret = __regex_exec(prog, str); + restore_heap_offset(); + return ret; +} + +static void regex_compile_wrapper(struct regex_prog *prog, + struct str *regex) +{ + struct regex *r = regex_parse(*regex); + /*regex_print(r);*/ + regex_compile(prog, r); + /*regex_prog_print(prog);*/ +} + +static void print_saved(char **saves) +{ + int i; + if (saves == NULL) { + printf("No match\n"); + return; + } + for (i = 0; i < 10; ++i) { + struct str tmp = { saves[i * 2], saves[i * 2 + 1] }; + printf("%d: %S\n", i, &tmp); + } +} + +static void read_part(struct str *part, struct str *s, char *err) +{ + char *tmp = s->start; + for (; tmp != s->end && *tmp != '/'; ++tmp) + ; + if (tmp == s->end) + error("%s", err); + part->start = s->start; + part->end = tmp; + s->start = tmp + 1; +} + +static int is_space(char c) +{ + return c == ' ' || c == '\t' || c == '\n'; +} + +static void skip_spaces(struct str *s) +{ + for (; s->start != s->end && is_space(*s->start); ++s->start) + ; +} + +static int parse_command(struct str *s) +{ + struct str cmdname; + + struct compiled_command *ccmd = command_buf + command_num; + + if (command_num == max_commands) + error("too many commands"); + + skip_spaces(s); + if (s->end - s->start == 0) + return 0; + + if (s->start[0] == '/') { + ++s->start; + read_part(&ccmd->str_condition, s, + "command: unfinished condition"); + regex_compile_wrapper(&ccmd->condition, + &ccmd->str_condition); + } else { + ccmd->str_condition.start = NULL; + ccmd->str_condition.end = NULL; + } + read_part(&cmdname, s, "command: unfinished command type"); + read_part(&ccmd->str_body, s, "command: unfinished body"); + regex_compile_wrapper(&ccmd->body, &ccmd->str_body); + + read_part(&ccmd->str_result, s, "command: unfinished result"); + if (streq(&cmdname, &CSTR("s")) == 0) + error("%s: command not supported"); + + ++command_num; + return 1; +} + +static void parse_file(char *fname) +{ + char *addr; + struct str contents; + struct stat st; + int fd = open(fname, O_RDONLY, 0); + + if (fd == -1) + goto syserror; + if (fstat(fd, &st) == -1) + goto syserror; + + addr = mmap(NULL, st.size, PROT_READ, MAP_PRIVATE, fd, 0); + if (addr == (void*)-1) + goto syserror; + if (close(fd) == -1) + goto syserror; + + contents = (struct str) { addr, addr + st.size }; + while (parse_command(&contents)) + ; + return; +syserror: + syscall_error(fname); +} + +static void parse_cmdargs(int argc, char **argv) +{ + ++argv; + if (*argv == NULL) { + printf("No help for you\n"); + hexit(1); + } else if (strzeq(*argv, "-f")) { + if (argv[1] == NULL) + error("-f: missing file"); + parse_file(argv[1]); + argv += 2; + if (*argv != NULL) + goto unexpected_arg; + } else { + parse_command(&STR(*argv)); + ++argv; + if (*argv != NULL) + goto unexpected_arg; + + } + return; +unexpected_arg: + error("%s: unexpected argument", *argv); +} + +static void construct_result(struct str *saves, struct str *result) +{ + char *tmp; + + printf("%S", saves + 8); + for (tmp = result->start; tmp != result->end; ++tmp) { + if (*tmp != '\\') { + printf("%c", *tmp); + } else { + if (tmp + 1 == result->end) + error("stray '\\' in input"); + ++tmp; + switch(*tmp) { + case '0' ... '7': + printf("%S", saves + (*tmp - '0')); + break; + default: + printf("%c", *tmp); + } + } + } + printf("%S\n", saves + 9); +} + +static void process_line(struct str *line) +{ + int i; + for (i = 0; i < command_num; ++i) { + struct compiled_command *cmd = command_buf + i; + char **res; + if (cmd->str_condition.start != cmd->str_condition.end) { + res = regex_exec(&cmd->condition, line); + if (res == NULL) + continue; + } + res = regex_exec(&cmd->body, line); + if (res == NULL) + continue; + /* printf("Found: %S|%S\n", &cmd->str_condition, + &cmd->str_body); */ + construct_result((struct str*)res, &cmd->str_result); + return; + } + printf("%S\n", line); +} + +static void process() +{ + struct str line; + + while (readline(&line)) + process_line(&line); +} + +int main(int argc, char **argv, char **envp) +{ + parse_cmdargs(argc, argv); + inbuf_init(); + process(); + + return 0; +} -- cgit v1.2.3