Update BurntSushi/toml

This commit is contained in:
Frank Denis 2022-01-13 09:22:22 +01:00
parent 351bced7c5
commit 4fd26029c7
14 changed files with 775 additions and 458 deletions

View file

@ -37,28 +37,14 @@ const (
itemInlineTableEnd
)
const (
eof = 0
comma = ','
tableStart = '['
tableEnd = ']'
arrayTableStart = '['
arrayTableEnd = ']'
tableSep = '.'
keySep = '='
arrayStart = '['
arrayEnd = ']'
commentStart = '#'
stringStart = '"'
stringEnd = '"'
rawStringStart = '\''
rawStringEnd = '\''
inlineTableStart = '{'
inlineTableEnd = '}'
)
const eof = 0
type stateFn func(lx *lexer) stateFn
func (p Position) String() string {
return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
}
type lexer struct {
input string
start int
@ -67,26 +53,26 @@ type lexer struct {
state stateFn
items chan item
// Allow for backing up up to four runes.
// This is necessary because TOML contains 3-rune tokens (""" and ''').
// Allow for backing up up to 4 runes. This is necessary because TOML
// contains 3-rune tokens (""" and ''').
prevWidths [4]int
nprev int // how many of prevWidths are in use
// If we emit an eof, we can still back up, but it is not OK to call
// next again.
atEOF bool
nprev int // how many of prevWidths are in use
atEOF bool // If we emit an eof, we can still back up, but it is not OK to call next again.
// A stack of state functions used to maintain context.
// The idea is to reuse parts of the state machine in various places.
// For example, values can appear at the top level or within arbitrarily
// nested arrays. The last state on the stack is used after a value has
// been lexed. Similarly for comments.
//
// The idea is to reuse parts of the state machine in various places. For
// example, values can appear at the top level or within arbitrarily nested
// arrays. The last state on the stack is used after a value has been lexed.
// Similarly for comments.
stack []stateFn
}
type item struct {
typ itemType
val string
line int
typ itemType
val string
err error
pos Position
}
func (lx *lexer) nextItem() item {
@ -96,7 +82,7 @@ func (lx *lexer) nextItem() item {
return item
default:
lx.state = lx.state(lx)
//fmt.Printf(" STATE %-24s current: %-10q stack: %s\n", lx.state, lx.current(), lx.stack)
//fmt.Printf(" STATE %-24s current: %-10q stack: %s\n", lx.state, lx.current(), lx.stack)
}
}
}
@ -105,9 +91,9 @@ func lex(input string) *lexer {
lx := &lexer{
input: input,
state: lexTop,
line: 1,
items: make(chan item, 10),
stack: make([]stateFn, 0, 10),
line: 1,
}
return lx
}
@ -129,13 +115,25 @@ func (lx *lexer) current() string {
return lx.input[lx.start:lx.pos]
}
func (lx lexer) getPos() Position {
p := Position{
Line: lx.line,
Start: lx.start,
Len: lx.pos - lx.start,
}
if p.Len <= 0 {
p.Len = 1
}
return p
}
func (lx *lexer) emit(typ itemType) {
lx.items <- item{typ, lx.current(), lx.line}
lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
lx.start = lx.pos
}
func (lx *lexer) emitTrim(typ itemType) {
lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.line}
lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
lx.start = lx.pos
}
@ -160,7 +158,13 @@ func (lx *lexer) next() (r rune) {
r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
if r == utf8.RuneError {
lx.errorf("invalid UTF-8 byte at position %d (line %d): 0x%02x", lx.pos, lx.line, lx.input[lx.pos])
lx.error(errLexUTF8{lx.input[lx.pos]})
return utf8.RuneError
}
// Note: don't use peek() here, as this calls next().
if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
lx.errorControlChar(r)
return utf8.RuneError
}
@ -188,6 +192,7 @@ func (lx *lexer) backup() {
lx.prevWidths[1] = lx.prevWidths[2]
lx.prevWidths[2] = lx.prevWidths[3]
lx.nprev--
lx.pos -= w
if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
lx.line--
@ -223,18 +228,58 @@ func (lx *lexer) skip(pred func(rune) bool) {
}
}
// errorf stops all lexing by emitting an error and returning `nil`.
// error stops all lexing by emitting an error and returning `nil`.
//
// Note that any value that is a character is escaped if it's a special
// character (newlines, tabs, etc.).
func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
lx.items <- item{
itemError,
fmt.Sprintf(format, values...),
lx.line,
func (lx *lexer) error(err error) stateFn {
if lx.atEOF {
return lx.errorPrevLine(err)
}
lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
return nil
}
// errorfPrevline is like error(), but sets the position to the last column of
// the previous line.
//
// This is so that unexpected EOF or NL errors don't show on a new blank line.
func (lx *lexer) errorPrevLine(err error) stateFn {
pos := lx.getPos()
pos.Line--
pos.Len = 1
pos.Start = lx.pos - 1
lx.items <- item{typ: itemError, pos: pos, err: err}
return nil
}
// errorPos is like error(), but allows explicitly setting the position.
func (lx *lexer) errorPos(start, length int, err error) stateFn {
pos := lx.getPos()
pos.Start = start
pos.Len = length
lx.items <- item{typ: itemError, pos: pos, err: err}
return nil
}
// errorf is like error, and creates a new error.
func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
if lx.atEOF {
pos := lx.getPos()
pos.Line--
pos.Len = 1
pos.Start = lx.pos - 1
lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
return nil
}
lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
return nil
}
func (lx *lexer) errorControlChar(cc rune) stateFn {
return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
}
// lexTop consumes elements at the top level of TOML data.
func lexTop(lx *lexer) stateFn {
r := lx.next()
@ -242,10 +287,10 @@ func lexTop(lx *lexer) stateFn {
return lexSkip(lx, lexTop)
}
switch r {
case commentStart:
case '#':
lx.push(lexTop)
return lexCommentStart
case tableStart:
case '[':
return lexTableStart
case eof:
if lx.pos > lx.start {
@ -268,7 +313,7 @@ func lexTop(lx *lexer) stateFn {
func lexTopEnd(lx *lexer) stateFn {
r := lx.next()
switch {
case r == commentStart:
case r == '#':
// a comment will read to a newline for us.
lx.push(lexTop)
return lexCommentStart
@ -292,7 +337,7 @@ func lexTopEnd(lx *lexer) stateFn {
// It also handles the case that this is an item in an array of tables.
// e.g., '[[name]]'.
func lexTableStart(lx *lexer) stateFn {
if lx.peek() == arrayTableStart {
if lx.peek() == '[' {
lx.next()
lx.emit(itemArrayTableStart)
lx.push(lexArrayTableEnd)
@ -309,10 +354,8 @@ func lexTableEnd(lx *lexer) stateFn {
}
func lexArrayTableEnd(lx *lexer) stateFn {
if r := lx.next(); r != arrayTableEnd {
return lx.errorf(
"expected end of table array name delimiter %q, but got %q instead",
arrayTableEnd, r)
if r := lx.next(); r != ']' {
return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
}
lx.emit(itemArrayTableEnd)
return lexTopEnd
@ -321,11 +364,11 @@ func lexArrayTableEnd(lx *lexer) stateFn {
func lexTableNameStart(lx *lexer) stateFn {
lx.skip(isWhitespace)
switch r := lx.peek(); {
case r == tableEnd || r == eof:
case r == ']' || r == eof:
return lx.errorf("unexpected end of table name (table names cannot be empty)")
case r == tableSep:
case r == '.':
return lx.errorf("unexpected table separator (table names cannot be empty)")
case r == stringStart || r == rawStringStart:
case r == '"' || r == '\'':
lx.ignore()
lx.push(lexTableNameEnd)
return lexQuotedName
@ -342,10 +385,10 @@ func lexTableNameEnd(lx *lexer) stateFn {
switch r := lx.next(); {
case isWhitespace(r):
return lexTableNameEnd
case r == tableSep:
case r == '.':
lx.ignore()
return lexTableNameStart
case r == tableEnd:
case r == ']':
return lx.pop()
default:
return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
@ -379,10 +422,10 @@ func lexQuotedName(lx *lexer) stateFn {
switch {
case isWhitespace(r):
return lexSkip(lx, lexValue)
case r == stringStart:
case r == '"':
lx.ignore() // ignore the '"'
return lexString
case r == rawStringStart:
case r == '\'':
lx.ignore() // ignore the "'"
return lexRawString
case r == eof:
@ -400,7 +443,7 @@ func lexKeyStart(lx *lexer) stateFn {
return lx.errorf("unexpected '=': key name appears blank")
case r == '.':
return lx.errorf("unexpected '.': keys cannot start with a '.'")
case r == stringStart || r == rawStringStart:
case r == '"' || r == '\'':
lx.ignore()
fallthrough
default: // Bare key
@ -416,7 +459,7 @@ func lexKeyNameStart(lx *lexer) stateFn {
return lx.errorf("unexpected '='")
case r == '.':
return lx.errorf("unexpected '.'")
case r == stringStart || r == rawStringStart:
case r == '"' || r == '\'':
lx.ignore()
lx.push(lexKeyEnd)
return lexQuotedName
@ -434,7 +477,7 @@ func lexKeyEnd(lx *lexer) stateFn {
case isWhitespace(r):
return lexSkip(lx, lexKeyEnd)
case r == eof:
return lx.errorf("unexpected EOF; expected key separator %q", keySep)
return lx.errorf("unexpected EOF; expected key separator '='")
case r == '.':
lx.ignore()
return lexKeyNameStart
@ -461,17 +504,17 @@ func lexValue(lx *lexer) stateFn {
return lexNumberOrDateStart
}
switch r {
case arrayStart:
case '[':
lx.ignore()
lx.emit(itemArray)
return lexArrayValue
case inlineTableStart:
case '{':
lx.ignore()
lx.emit(itemInlineTableStart)
return lexInlineTableValue
case stringStart:
if lx.accept(stringStart) {
if lx.accept(stringStart) {
case '"':
if lx.accept('"') {
if lx.accept('"') {
lx.ignore() // Ignore """
return lexMultilineString
}
@ -479,9 +522,9 @@ func lexValue(lx *lexer) stateFn {
}
lx.ignore() // ignore the '"'
return lexString
case rawStringStart:
if lx.accept(rawStringStart) {
if lx.accept(rawStringStart) {
case '\'':
if lx.accept('\'') {
if lx.accept('\'') {
lx.ignore() // Ignore """
return lexMultilineRawString
}
@ -520,14 +563,12 @@ func lexArrayValue(lx *lexer) stateFn {
switch {
case isWhitespace(r) || isNL(r):
return lexSkip(lx, lexArrayValue)
case r == commentStart:
case r == '#':
lx.push(lexArrayValue)
return lexCommentStart
case r == comma:
case r == ',':
return lx.errorf("unexpected comma")
case r == arrayEnd:
// NOTE(caleb): The spec isn't clear about whether you can have
// a trailing comma or not, so we'll allow it.
case r == ']':
return lexArrayEnd
}
@ -540,22 +581,20 @@ func lexArrayValue(lx *lexer) stateFn {
// the next value (or the end of the array): it ignores whitespace and newlines
// and expects either a ',' or a ']'.
func lexArrayValueEnd(lx *lexer) stateFn {
r := lx.next()
switch {
switch r := lx.next(); {
case isWhitespace(r) || isNL(r):
return lexSkip(lx, lexArrayValueEnd)
case r == commentStart:
case r == '#':
lx.push(lexArrayValueEnd)
return lexCommentStart
case r == comma:
case r == ',':
lx.ignore()
return lexArrayValue // move on to the next value
case r == arrayEnd:
case r == ']':
return lexArrayEnd
default:
return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
}
return lx.errorf(
"expected a comma or array terminator %q, but got %s instead",
arrayEnd, runeOrEOF(r))
}
// lexArrayEnd finishes the lexing of an array.
@ -574,13 +613,13 @@ func lexInlineTableValue(lx *lexer) stateFn {
case isWhitespace(r):
return lexSkip(lx, lexInlineTableValue)
case isNL(r):
return lx.errorf("newlines not allowed within inline tables")
case r == commentStart:
return lx.errorPrevLine(errLexInlineTableNL{})
case r == '#':
lx.push(lexInlineTableValue)
return lexCommentStart
case r == comma:
case r == ',':
return lx.errorf("unexpected comma")
case r == inlineTableEnd:
case r == '}':
return lexInlineTableEnd
}
lx.backup()
@ -596,23 +635,21 @@ func lexInlineTableValueEnd(lx *lexer) stateFn {
case isWhitespace(r):
return lexSkip(lx, lexInlineTableValueEnd)
case isNL(r):
return lx.errorf("newlines not allowed within inline tables")
case r == commentStart:
return lx.errorPrevLine(errLexInlineTableNL{})
case r == '#':
lx.push(lexInlineTableValueEnd)
return lexCommentStart
case r == comma:
case r == ',':
lx.ignore()
lx.skip(isWhitespace)
if lx.peek() == '}' {
return lx.errorf("trailing comma not allowed in inline tables")
}
return lexInlineTableValue
case r == inlineTableEnd:
case r == '}':
return lexInlineTableEnd
default:
return lx.errorf(
"expected a comma or an inline table terminator %q, but got %s instead",
inlineTableEnd, runeOrEOF(r))
return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
}
}
@ -638,14 +675,12 @@ func lexString(lx *lexer) stateFn {
switch {
case r == eof:
return lx.errorf(`unexpected EOF; expected '"'`)
case isControl(r) || r == '\r':
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
case isNL(r):
return lx.errorf("strings cannot contain newlines")
return lx.errorPrevLine(errLexStringNL{})
case r == '\\':
lx.push(lexString)
return lexStringEscape
case r == stringEnd:
case r == '"':
lx.backup()
lx.emit(itemString)
lx.next()
@ -660,23 +695,20 @@ func lexString(lx *lexer) stateFn {
func lexMultilineString(lx *lexer) stateFn {
r := lx.next()
switch r {
default:
return lexMultilineString
case eof:
return lx.errorf(`unexpected EOF; expected '"""'`)
case '\r':
if lx.peek() != '\n' {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineString
case '\\':
return lexMultilineStringEscape
case stringEnd:
case '"':
/// Found " → try to read two more "".
if lx.accept(stringEnd) {
if lx.accept(stringEnd) {
if lx.accept('"') {
if lx.accept('"') {
/// Peek ahead: the string can contain " and "", including at the
/// end: """str"""""
/// 6 or more at the end, however, is an error.
if lx.peek() == stringEnd {
if lx.peek() == '"' {
/// Check if we already lexed 5 's; if so we have 6 now, and
/// that's just too many man!
if strings.HasSuffix(lx.current(), `"""""`) {
@ -699,12 +731,8 @@ func lexMultilineString(lx *lexer) stateFn {
}
lx.backup()
}
return lexMultilineString
}
if isControl(r) {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineString
}
// lexRawString consumes a raw string. Nothing can be escaped in such a string.
@ -712,20 +740,19 @@ func lexMultilineString(lx *lexer) stateFn {
func lexRawString(lx *lexer) stateFn {
r := lx.next()
switch {
default:
return lexRawString
case r == eof:
return lx.errorf(`unexpected EOF; expected "'"`)
case isControl(r) || r == '\r':
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
case isNL(r):
return lx.errorf("strings cannot contain newlines")
case r == rawStringEnd:
return lx.errorPrevLine(errLexStringNL{})
case r == '\'':
lx.backup()
lx.emit(itemRawString)
lx.next()
lx.ignore()
return lx.pop()
}
return lexRawString
}
// lexMultilineRawString consumes a raw string. Nothing can be escaped in such
@ -734,21 +761,18 @@ func lexRawString(lx *lexer) stateFn {
func lexMultilineRawString(lx *lexer) stateFn {
r := lx.next()
switch r {
default:
return lexMultilineRawString
case eof:
return lx.errorf(`unexpected EOF; expected "'''"`)
case '\r':
if lx.peek() != '\n' {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineRawString
case rawStringEnd:
case '\'':
/// Found ' → try to read two more ''.
if lx.accept(rawStringEnd) {
if lx.accept(rawStringEnd) {
if lx.accept('\'') {
if lx.accept('\'') {
/// Peek ahead: the string can contain ' and '', including at the
/// end: '''str'''''
/// 6 or more at the end, however, is an error.
if lx.peek() == rawStringEnd {
if lx.peek() == '\'' {
/// Check if we already lexed 5 's; if so we have 6 now, and
/// that's just too many man!
if strings.HasSuffix(lx.current(), "'''''") {
@ -771,12 +795,8 @@ func lexMultilineRawString(lx *lexer) stateFn {
}
lx.backup()
}
return lexMultilineRawString
}
if isControl(r) {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineRawString
}
// lexMultilineStringEscape consumes an escaped character. It assumes that the
@ -817,8 +837,7 @@ func lexStringEscape(lx *lexer) stateFn {
case 'U':
return lexLongUnicodeEscape
}
return lx.errorf("invalid escape character %q; only the following escape characters are allowed: "+
`\b, \t, \n, \f, \r, \", \\, \uXXXX, and \UXXXXXXXX`, r)
return lx.error(errLexEscape{r})
}
func lexShortUnicodeEscape(lx *lexer) stateFn {
@ -1108,8 +1127,6 @@ func lexComment(lx *lexer) stateFn {
lx.backup()
lx.emit(itemText)
return lx.pop()
case isControl(r):
return lx.errorf("control characters are not allowed inside comments: '0x%02x'", r)
default:
return lexComment
}
@ -1121,52 +1138,6 @@ func lexSkip(lx *lexer, nextState stateFn) stateFn {
return nextState
}
// isWhitespace returns true if `r` is a whitespace character according
// to the spec.
func isWhitespace(r rune) bool {
return r == '\t' || r == ' '
}
func isNL(r rune) bool {
return r == '\n' || r == '\r'
}
// Control characters except \n, \t
func isControl(r rune) bool {
switch r {
case '\t', '\r', '\n':
return false
default:
return (r >= 0x00 && r <= 0x1f) || r == 0x7f
}
}
func isDigit(r rune) bool {
return r >= '0' && r <= '9'
}
func isHexadecimal(r rune) bool {
return (r >= '0' && r <= '9') ||
(r >= 'a' && r <= 'f') ||
(r >= 'A' && r <= 'F')
}
func isOctal(r rune) bool {
return r >= '0' && r <= '7'
}
func isBinary(r rune) bool {
return r == '0' || r == '1'
}
func isBareKeyChar(r rune) bool {
return (r >= 'A' && r <= 'Z') ||
(r >= 'a' && r <= 'z') ||
(r >= '0' && r <= '9') ||
r == '_' ||
r == '-'
}
func (s stateFn) String() string {
name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
if i := strings.LastIndexByte(name, '.'); i > -1 {
@ -1223,3 +1194,26 @@ func (itype itemType) String() string {
func (item item) String() string {
return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
}
func isWhitespace(r rune) bool { return r == '\t' || r == ' ' }
func isNL(r rune) bool { return r == '\n' || r == '\r' }
func isControl(r rune) bool { // Control characters except \t, \r, \n
switch r {
case '\t', '\r', '\n':
return false
default:
return (r >= 0x00 && r <= 0x1f) || r == 0x7f
}
}
func isDigit(r rune) bool { return r >= '0' && r <= '9' }
func isBinary(r rune) bool { return r == '0' || r == '1' }
func isOctal(r rune) bool { return r >= '0' && r <= '7' }
func isHexadecimal(r rune) bool {
return (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')
}
func isBareKeyChar(r rune) bool {
return (r >= 'A' && r <= 'Z') ||
(r >= 'a' && r <= 'z') ||
(r >= '0' && r <= '9') ||
r == '_' || r == '-'
}