// Package shlex provides a simple lexical analyzer that splits strings the
// way a Unix shell does.
package shlex

import (
	"bufio"
	"errors"
	"io"
	"strings"
	"unicode"
)

var (
	// ErrNoClosing is returned when the input ends inside a quoted string.
	ErrNoClosing = errors.New("no closing quotation")
	// ErrNoEscaped is returned when the input ends right after an escape character.
	ErrNoEscaped = errors.New("no escaped character")
)

// Tokenizer is the interface that classifies runes as word characters,
// whitespace, quotes, escape characters, or quotes inside which escape
// characters are recognized.
type Tokenizer interface {
	IsWord(rune) bool
	IsWhitespace(rune) bool
	IsQuote(rune) bool
	IsEscape(rune) bool
	IsEscapedQuote(rune) bool
}

// DefaultTokenizer implements shell-like tokenizing rules: letters, digits
// and '_' form words; single and double quotes delimit strings; backslash
// escapes; and escapes are recognized only inside double quotes.
type DefaultTokenizer struct{}

func (t *DefaultTokenizer) IsWord(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsNumber(r)
}
func (t *DefaultTokenizer) IsQuote(r rune) bool {
	switch r {
	case '\'', '"':
		return true
	default:
		return false
	}
}
func (t *DefaultTokenizer) IsWhitespace(r rune) bool {
	return unicode.IsSpace(r)
}
func (t *DefaultTokenizer) IsEscape(r rune) bool {
	return r == '\\'
}
func (t *DefaultTokenizer) IsEscapedQuote(r rune) bool {
	return r == '"'
}
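
// As an illustration of the Tokenizer interface, a custom tokenizer can embed
// DefaultTokenizer and override individual methods. PathTokenizer below is a
// hypothetical example, not part of the original API: it additionally treats
// '/', '.' and '-' as word characters, so flags and file paths survive as
// single tokens even without whitespace splitting.
type PathTokenizer struct {
	DefaultTokenizer
}

func (t *PathTokenizer) IsWord(r rune) bool {
	return r == '/' || r == '.' || r == '-' || t.DefaultTokenizer.IsWord(r)
}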

// Lexer represents a lexical analyzer.
type Lexer struct {
	reader          *bufio.Reader
	tokenizer       Tokenizer
	posix           bool
	whitespacesplit bool
}

// NewLexer creates a new Lexer reading from an io.Reader. The returned Lexer
// uses a DefaultTokenizer and tokenizes according to the posix and
// whitespacesplit flags.
func NewLexer(r io.Reader, posix, whitespacesplit bool) *Lexer {
	return &Lexer{
		reader:          bufio.NewReader(r),
		tokenizer:       &DefaultTokenizer{},
		posix:           posix,
		whitespacesplit: whitespacesplit,
	}
}
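
// A minimal usage sketch reading from a file; the file name is hypothetical:
//
//	f, err := os.Open("commands.txt")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//	tokens, err := shlex.NewLexer(f, true, true).Split()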

// NewLexerString creates a new Lexer reading from a string. The returned
// Lexer uses a DefaultTokenizer and tokenizes according to the posix and
// whitespacesplit flags.
func NewLexerString(s string, posix, whitespacesplit bool) *Lexer {
	return NewLexer(strings.NewReader(s), posix, whitespacesplit)
}

// Split splits a string into tokens according to posix or non-posix rules,
// with whitespace splitting enabled.
func Split(s string, posix bool) ([]string, error) {
	return NewLexerString(s, posix, true).Split()
}
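
// For example (token values assume DefaultTokenizer semantics):
//
//	tokens, err := shlex.Split(`cp -r "my dir" /tmp`, true)
//	// err == nil
//	// tokens == []string{"cp", "-r", "my dir", "/tmp"}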

// SetTokenizer replaces the Lexer's Tokenizer with a custom implementation.
func (l *Lexer) SetTokenizer(t Tokenizer) {
	l.tokenizer = t
}
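
// Combined with the hypothetical PathTokenizer sketched above, this lets
// flags and paths come through as single tokens even without whitespace
// splitting:
//
//	l := shlex.NewLexerString("ls -al ./src", true, false)
//	l.SetTokenizer(&shlex.PathTokenizer{})
//	tokens, err := l.Split()
//	// tokens == []string{"ls", "-al", "./src"}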

// Split reads the whole input and returns all tokens. On a tokenizing error
// the tokens read so far are returned along with the error.
func (l *Lexer) Split() ([]string, error) {
	result := make([]string, 0)
	for {
		token, err := l.readToken()
		if token != "" {
			result = append(result, token)
		}

		if err == io.EOF {
			break
		} else if err != nil {
			return result, err
		}
	}
	return result, nil
}

// readToken reads a single token, driven by a small state machine. The state
// is a rune: ' ' means scanning whitespace, 'a' means inside a word, a quote
// rune means inside that quote, and an escape rune means the previous rune
// was an escape character. escapedstate remembers which state to return to
// after an escape sequence.
func (l *Lexer) readToken() (string, error) {
	t := l.tokenizer
	token := ""
	quoted := false
	state := ' '
	escapedstate := ' '
scanning:
	for {
		next, _, err := l.reader.ReadRune()
		if err != nil {
			// Input ended: report unterminated quotes or escapes,
			// otherwise pass the error (usually io.EOF) through.
			if t.IsQuote(state) {
				return token, ErrNoClosing
			} else if t.IsEscape(state) {
				return token, ErrNoEscaped
			}
			return token, err
		}

		switch {
		case t.IsWhitespace(state):
			// Between tokens: decide what the next rune starts.
			switch {
			case t.IsWhitespace(next):
				break scanning
			case l.posix && t.IsEscape(next):
				escapedstate = 'a'
				state = next
			case t.IsWord(next):
				token += string(next)
				state = 'a'
			case t.IsQuote(next):
				// In non-posix mode quotes are kept in the token.
				if !l.posix {
					token += string(next)
				}
				state = next
			default:
				token += string(next)
				if l.whitespacesplit {
					state = 'a'
				} else if token != "" || (l.posix && quoted) {
					break scanning
				}
			}
		case t.IsQuote(state):
			// Inside a quoted string.
			quoted = true
			switch {
			case next == state:
				// Closing quote: in non-posix mode it is kept and
				// ends the token; in posix mode scanning continues.
				if !l.posix {
					token += string(next)
					break scanning
				} else {
					state = 'a'
				}
			case l.posix && t.IsEscape(next) && t.IsEscapedQuote(state):
				escapedstate = state
				state = next
			default:
				token += string(next)
			}
		case t.IsEscape(state):
			// Rune following an escape character. Inside quotes the
			// escape applies only to the escape and quote characters
			// themselves; otherwise the escape character is kept.
			if t.IsQuote(escapedstate) && next != state && next != escapedstate {
				token += string(state)
			}
			token += string(next)
			state = escapedstate
		case t.IsWord(state):
			// Inside a word.
			switch {
			case t.IsWhitespace(next):
				if token != "" || (l.posix && quoted) {
					break scanning
				}
			case l.posix && t.IsQuote(next):
				state = next
			case l.posix && t.IsEscape(next):
				escapedstate = 'a'
				state = next
			case t.IsWord(next) || t.IsQuote(next):
				token += string(next)
			default:
				if l.whitespacesplit {
					token += string(next)
				} else if token != "" {
					// The rune starts the next token; push it back.
					l.reader.UnreadRune()
					break scanning
				}
			}
		}
	}
	return token, nil
}
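
// To illustrate the state machine above: the same input tokenizes differently
// depending on the posix flag (results assume DefaultTokenizer semantics):
//
//	shlex.Split(`one 'two three'`, true)  // → ["one", "two three"], quotes stripped
//	shlex.Split(`one 'two three'`, false) // → ["one", "'two three'"], quotes kept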