From 0a8690993d572a50b95dd4f1c1903ed00ddb9c2b Mon Sep 17 00:00:00 2001 From: Charlie Stanton Date: Wed, 21 Sep 2022 21:05:34 +0100 Subject: Initial commit Parses and executes substitute expressions (subexes) So far subex has the following operations: - Concatenation of a and b with ab - Or with | - Repeat maximally with * - Repeat minimally with - - Copy a specific character 'a' - Copy any character '.' - Store text matching a regex into slot 's': `$s(regex)` - Output text in "" including loading from slots with '$' Regexes support all the same operations as subexes minus storing and outputting This first implementation gives very little thought to efficiency Example: ./main 'according to all known laws of aviation' '$1(.-)$m(( .* )| ).*"$m$1"' This swaps the first and last words of the input string --- go.mod | 3 ++ main/lex.go | 34 +++++++++++++ main/main.go | 86 ++++++++++++++++++++++++++++++++ main/parse.go | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++ main/regexast.go | 72 +++++++++++++++++++++++++++ main/regexstate.go | 48 ++++++++++++++++++ main/subexast.go | 117 ++++++++++++++++++++++++++++++++++++++++++++ main/subexstate.go | 123 ++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 624 insertions(+) create mode 100644 go.mod create mode 100644 main/lex.go create mode 100644 main/main.go create mode 100644 main/parse.go create mode 100644 main/regexast.go create mode 100644 main/regexstate.go create mode 100644 main/subexast.go create mode 100644 main/subexstate.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..6967dfd --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module main + +go 1.18 diff --git a/main/lex.go b/main/lex.go new file mode 100644 index 0000000..c08f402 --- /dev/null +++ b/main/lex.go @@ -0,0 +1,34 @@ +package main + +import ( + "unicode/utf8" +) + +const eof rune = -1 +type RuneReader struct { + input string + pos, width int +} +func (l *RuneReader) next() rune { + if l.pos >= len(l.input) { + l.width = 0 + return eof + } + var r rune + r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) + l.pos += l.width + return r +} +func (l *RuneReader) accept(chars string) bool { + r := l.next() + for _, char := range chars { + if char == r { + return true + } + } + l.rewind() + return false +} +func (l *RuneReader) rewind() { + l.pos -= l.width +} diff --git a/main/main.go b/main/main.go new file mode 100644 index 0000000..be43d90 --- /dev/null +++ b/main/main.go @@ -0,0 +1,86 @@ +package main + +import ( + "os" + "fmt" +) + +type TransducerOutput interface { + build(Store) string +} + +type TransducerReplacementRune rune +func (replacement TransducerReplacementRune) build(store Store) string { + return string(replacement) +} + +type TransducerReplacementLoad rune +func (replacement TransducerReplacementLoad) build(store Store) string { + return store[rune(replacement)] +} + +type Store map[rune]string +func (store Store) clone() Store { + newStore := make(Store) + for key, val := range store { + newStore[key] = val + } + return newStore +} + +func compileTransducer(transducerAst SubexAST) SubexState { + return transducerAst.compileWith(SubexNoneState{}) +} + +type SubexBranch struct { + store Store + state SubexState + output string +} +func (pair SubexBranch) eat(char rune) []SubexBranch { + states := pair.state.eat(pair.store, char) + for i := range states { + states[i].output = pair.output + states[i].output + } + return states +} +func (pair SubexBranch) accepting() []string { + return pair.state.accepting(pair.store) +} + +func runTransducer(transducer SubexState, input string) (output string, err bool) { + states := []SubexBranch{{ + state: transducer, + output: "", + store: make(Store), + }} + for _, char := range input { + var newStates []SubexBranch + for _, state := range states { + newStates = append(newStates, state.eat(char)...) + } + states = newStates + } + for _, state := range states { + outputEnds := state.accepting() + for _, outputEnd := range outputEnds { + return state.output + outputEnd, false + } + } + return "", true +} + +func main() { + if len(os.Args) != 3 { + panic("Expected: program [input] [subex]") + } + input := os.Args[1] + program := os.Args[2] + ast := parse(program) + transducer := compileTransducer(ast) + output, err := runTransducer(transducer, input) + if err { + output = input + } + fmt.Println(output) +} diff --git a/main/parse.go b/main/parse.go new file mode 100644 index 0000000..a9bd4b5 --- /dev/null +++ b/main/parse.go @@ -0,0 +1,141 @@ +package main + +func parseReplacement(l *RuneReader) (output []TransducerOutput) { + loop: for { + r := l.next() + switch r { + case eof: + panic("Missing closing \"") + case '"': + break loop + case '$': + slot := l.next() + if slot == eof { + panic("Missing slot character") + } + output = append(output, TransducerReplacementLoad(slot)) + default: + output = append(output, TransducerReplacementRune(r)) + } + } + return output +} + +func parseRegex(l *RuneReader, minPower int) RegexAST { + var lhs RegexAST + r := l.next() + switch r { + case eof: + return nil + case ')', '*', '-', '|': + l.rewind() + return nil + case '(': + lhs = parseRegex(l, 0) + if !l.accept(")") { + panic("Missing matching )") + } + case '.': + lhs = RegexASTAny{} + default: + lhs = RegexASTRune(r) + } + loop: for { + if minPower <= 0 { + next := parseRegex(l, 1) + if next != nil { + lhs = RegexASTConcat{lhs, next} + continue loop + } + } + r := l.next() + switch { + case r == '*' && minPower <= 4: + lhs = RegexASTMaximise{lhs} + case r == '-' && minPower <= 4: + lhs = RegexASTMinimise{lhs} + case r == '|' && minPower <= 2: + rhs := parseRegex(l, 3) + if rhs == nil { + panic("Missing regex after |") + } + lhs = RegexASTOr{lhs, rhs} + default: + l.rewind() + break loop + } + } + return lhs +} + +func parseSubex(l *RuneReader, minPower int) SubexAST { + var lhs SubexAST + r := l.next() + switch r { + case eof: + return nil + case '(': + lhs = parseSubex(l, 0) + if !l.accept(")") { + panic("Missing matching )") + } + case ')', '*', '-', '|': + l.rewind() + return nil + case '$': + slot := l.next() + if slot == eof { + panic("Missing slot character") + } + match := parseRegex(l, 100) + if match == nil { + panic("Missing regex for store") + } + lhs = SubexASTStore{ + match: match, + slot: slot, + } + case '"': + replacement := parseReplacement(l) + lhs = SubexASTOutput{replacement} + case '.': + lhs = SubexASTCopyAny{} + default: + lhs = SubexASTCopyRune(r) + } + loop: for { + if minPower <= 0 { + next := parseSubex(l, 1) + if next != nil { + lhs = SubexASTConcat{lhs, next} + continue loop + } + } + r := l.next() + switch { + case r == '*' && minPower <= 4: + lhs = SubexASTMaximise{lhs} + case r == '-' && minPower <= 4: + lhs = SubexASTMinimise{lhs} + case r == '|' && minPower <= 2: + rhs := parseSubex(l, 3) + if rhs == nil { + panic("Missing subex after |") + } + lhs = SubexASTOr{lhs, rhs} + default: + l.rewind() + break loop + } + } + return lhs +} + +func parse(input string) SubexAST { + l := RuneReader { + input: input, + pos: 0, + width: 0, + } + return parseSubex(&l, 0) +} diff --git a/main/regexast.go b/main/regexast.go new file mode 100644 index 0000000..0aab053 --- /dev/null +++ b/main/regexast.go @@ -0,0 +1,72 @@ +package main + +import ( + "fmt" +) + +type RegexAST interface { + compileWith(next RegexState) RegexState +} + +type RegexASTRune rune +func (ast RegexASTRune) compileWith(next RegexState) RegexState { + return RegexRuneState{ + rune: rune(ast), + next: next, + } +} +func (ast RegexASTRune) String() string { + return string(rune(ast)) +} + +type RegexASTAny struct {} +func (ast RegexASTAny) compileWith(next RegexState) RegexState { + return RegexAnyState{next} +} +func (ast RegexASTAny) String() string { + return "." +} + +type RegexASTConcat struct { + first, second RegexAST +} +func (ast RegexASTConcat) compileWith(next RegexState) RegexState { + return ast.first.compileWith(ast.second.compileWith(next)) +} +func (ast RegexASTConcat) String() string { + return fmt.Sprintf("Concat{%v, %v}", ast.first, ast.second) +} + +type RegexASTOr struct { + first, second RegexAST +} +func (ast RegexASTOr) compileWith(next RegexState) RegexState { + return RegexGroupState{ + ast.first.compileWith(next), + ast.second.compileWith(next), + } +} + +type RegexASTMaximise struct { + content RegexAST +} +func (ast RegexASTMaximise) compileWith(next RegexState) RegexState { + state := &RegexGroupState{ + nil, + next, + } + state.first = ast.content.compileWith(state) + return state +} + +type RegexASTMinimise struct { + content RegexAST +} +func (ast RegexASTMinimise) compileWith(next RegexState) RegexState { + state := &RegexGroupState{ + next, + nil, + } + state.second = ast.content.compileWith(state) + return state +} diff --git a/main/regexstate.go b/main/regexstate.go new file mode 100644 index 0000000..16d5581 --- /dev/null +++ b/main/regexstate.go @@ -0,0 +1,48 @@ +package main + +type RegexState interface { + eat(char rune) []RegexState + accepting() bool +} + +type RegexNoneState struct {} +func (state RegexNoneState) eat(char rune) []RegexState { + return nil +} +func (state RegexNoneState) accepting() bool { + return true +} + +type RegexAnyState struct { + next RegexState +} +func (state RegexAnyState) eat(char rune) []RegexState { + return []RegexState{state.next} +} +func (state RegexAnyState) accepting() bool { + return false +} + +type RegexRuneState struct { + rune rune + next RegexState +} +func (state RegexRuneState) eat(char rune) []RegexState { + if char == state.rune { + return []RegexState{state.next} + } + return nil +} +func (state RegexRuneState) accepting() bool { + return false +} + +type RegexGroupState struct { + first, second RegexState +} +func (state RegexGroupState) eat(char rune) []RegexState { + return append(state.first.eat(char), state.second.eat(char)...) +} +func (state RegexGroupState) accepting() bool { + return state.first.accepting() || state.second.accepting() +} diff --git a/main/subexast.go b/main/subexast.go new file mode 100644 index 0000000..7e2f33c --- /dev/null +++ b/main/subexast.go @@ -0,0 +1,117 @@ +package main + +import ( + "fmt" +) + +type SubexAST interface { + compileWith(next SubexState) SubexState +} + +type SubexASTConcat struct { + first, second SubexAST +} +func (ast SubexASTConcat) compileWith(next SubexState) SubexState { + return ast.first.compileWith(ast.second.compileWith(next)) +} +func (ast SubexASTConcat) String() string { + return fmt.Sprintf("(%v)(%v)", ast.first, ast.second) +} + +type SubexASTStore struct { + match RegexAST + slot rune +} +func (ast SubexASTStore) compileWith(next SubexState) SubexState { + return SubexStoreState { + match: ast.match.compileWith(RegexNoneState{}), + slot: ast.slot, + next: next, + } +} +func (ast SubexASTStore) String() string { + return fmt.Sprintf("$%c(%v)", ast.slot, ast.match) +} + +type SubexASTOr struct { + first, second SubexAST +} +func (ast SubexASTOr) compileWith(next SubexState) SubexState { + return SubexGroupState { + ast.first.compileWith(next), + ast.second.compileWith(next), + } +} + +type SubexASTMaximise struct { + content SubexAST +} +func (ast SubexASTMaximise) compileWith(next SubexState) SubexState { + state := &SubexGroupState { + nil, + next, + } + state.first = ast.content.compileWith(state) + return state +} +func (ast SubexASTMaximise) String() string { + return fmt.Sprintf("(%v)*", ast.content) +} + +type SubexASTMinimise struct { + content SubexAST +} +func (ast SubexASTMinimise) compileWith(next SubexState) SubexState { + state := &SubexGroupState { + next, + nil, + } + state.second = ast.content.compileWith(state) + return state +} +func (ast SubexASTMinimise) String() string { + return fmt.Sprintf("(%v)-", ast.content) +} + +type SubexASTRepeat struct { + content SubexAST + min, max int +} +func (ast SubexASTRepeat) compileWith(next SubexState) SubexState { + for i := ast.min; i < ast.max; i += 1 { + next = SubexGroupState { + ast.content.compileWith(next), + next, + } + } + for i := 0; i < ast.min; i += 1 { + next = ast.content.compileWith(next) + } + return next +} + +type SubexASTCopyRune rune +func (ast SubexASTCopyRune) compileWith(next SubexState) SubexState { + return SubexCopyRuneState{ + rune: rune(ast), + next: next, + } +} + +type SubexASTCopyAny struct {} +func (ast SubexASTCopyAny) compileWith(next SubexState) SubexState { + return SubexCopyAnyState{next} +} +func (ast SubexASTCopyAny) String() string { + return "." +} + +type SubexASTOutput struct { + replacement []TransducerOutput +} +func (ast SubexASTOutput) compileWith(next SubexState) SubexState { + return SubexOutputState{ + content: ast.replacement, + next: next, + } +} diff --git a/main/subexstate.go b/main/subexstate.go new file mode 100644 index 0000000..cc697f0 --- /dev/null +++ b/main/subexstate.go @@ -0,0 +1,123 @@ +package main + +import ( + "strings" +) + +type SubexState interface { + eat(store Store, char rune) []SubexBranch + accepting(store Store) []string +} + +type SubexGroupState struct { + first, second SubexState +} +func (state SubexGroupState) eat(store Store, char rune) []SubexBranch { + otherStore := store.clone() + return append(state.first.eat(store, char), state.second.eat(otherStore, char)...) +} +func (state SubexGroupState) accepting(store Store) []string { + return append(state.first.accepting(store), state.second.accepting(store)...) +} + +type SubexStoreState struct { + match RegexState + slot rune + next SubexState + input string +} +func (state SubexStoreState) eat(store Store, char rune) []SubexBranch { + var nextStates []SubexBranch + if state.match.accepting() { + store[state.slot] = state.input + nextStates = state.next.eat(store, char) + } + nextRegexStates := state.match.eat(char) + for _, regexState := range nextRegexStates { + nextStates = append(nextStates, SubexBranch { + state: SubexStoreState { + match: regexState, + slot: state.slot, + next: state.next, + input: state.input + string(char), + }, + output: "", + store: store.clone(), + }) + } + return nextStates +} +func (state SubexStoreState) accepting(store Store) []string { + if state.match.accepting() { + return state.next.accepting(store) + } + return nil +} + +type SubexOutputState struct { + content []TransducerOutput + next SubexState +} +func (state SubexOutputState) build(store Store) string { + var builder strings.Builder + for _, part := range state.content { + builder.WriteString(part.build(store)) + } + return builder.String() +} +func (state SubexOutputState) eat(store Store, char rune) []SubexBranch { + content := state.build(store) + nextStates := state.next.eat(store, char) + for i := range nextStates { + nextStates[i].output = content + nextStates[i].output + } + return nextStates +} +func (state SubexOutputState) accepting(store Store) []string { + content := state.build(store) + outputs := state.next.accepting(store) + for i := range outputs { + outputs[i] = content + outputs[i] + } + return outputs +} + +type SubexNoneState struct {} +func (state SubexNoneState) eat(store Store, char rune) []SubexBranch { + return nil +} +func (state SubexNoneState) accepting(store Store) []string { + return []string{""} +} + +type SubexCopyRuneState struct { + rune rune + next SubexState +} +func (state SubexCopyRuneState) eat(store Store, char rune) []SubexBranch { + if char == state.rune { + return []SubexBranch{{ + state: state.next, + output: string(char), + store: store, + }} + } + return nil +} +func (state SubexCopyRuneState) accepting(store Store) []string { + return nil +} + +type SubexCopyAnyState struct { + next SubexState +} +func (state SubexCopyAnyState) eat(store Store, char rune) []SubexBranch { + return []SubexBranch{{ + state: state.next, + output: string(char), + store: store, + }} +} +func (state SubexCopyAnyState) accepting(store Store) []string { + return nil +} -- cgit v1.2.3