// judge/dsl/lexer.go

package dsl

import (
	"fmt"
	"strings"
	"unicode"
)

// TokenType identifies the lexical category of a Token.
type TokenType int

// Token kinds. Values are assigned by iota in declaration order, starting at
// zero with TOKEN_STRING and ending with TOKEN_EOF.
const (
	// Value-carrying tokens.
	TOKEN_STRING TokenType = iota
	TOKEN_IDENT
	TOKEN_FLOAT
	TOKEN_INT
	TOKEN_DURATION

	// Single-character punctuation.
	TOKEN_LBRACE
	TOKEN_RBRACE
	TOKEN_LPAREN
	TOKEN_RPAREN
	TOKEN_ASSIGN
	TOKEN_TILDE

	// End-of-input marker.
	TOKEN_EOF
)

// String returns a short display name for the token type: an upper-case
// word for value-carrying tokens and EOF, the literal character for
// punctuation, and "UNKNOWN" for any value outside the declared range.
func (t TokenType) String() string {
	names := [...]string{
		TOKEN_STRING:   "STRING",
		TOKEN_IDENT:    "IDENT",
		TOKEN_FLOAT:    "FLOAT",
		TOKEN_INT:      "INT",
		TOKEN_DURATION: "DURATION",
		TOKEN_LBRACE:   "{",
		TOKEN_RBRACE:   "}",
		TOKEN_LPAREN:   "(",
		TOKEN_RPAREN:   ")",
		TOKEN_ASSIGN:   "=",
		TOKEN_TILDE:    "~",
		TOKEN_EOF:      "EOF",
	}
	if t < TOKEN_STRING || t > TOKEN_EOF {
		return "UNKNOWN"
	}
	return names[t]
}

// Token is a single lexical unit together with the position where it starts.
type Token struct {
	Type  TokenType // lexical category
	Value string    // token text; for string tokens, the decoded contents
	Line  int       // line of the token's first character (counted from 1)
	Col   int       // column of the token's first character (counted from 1)
}

// String renders the token for debugging as Token(TYPE, "value", line:col).
func (t Token) String() string {
	kind, text := t.Type, t.Value
	return fmt.Sprintf("Token(%s, %q, %d:%d)", kind, text, t.Line, t.Col)
}

// Lexer walks a rune slice and tracks the current line/column for
// position reporting.
type Lexer struct {
	src  []rune // full input, decoded into runes
	pos  int    // index into src of the next unread rune
	line int    // line of the next unread rune
	col  int    // column of the next unread rune
}

// NewLexer returns a Lexer positioned at the start of src, at line 1,
// column 1.
func NewLexer(src string) *Lexer {
	lx := new(Lexer)
	lx.src = []rune(src)
	lx.line, lx.col = 1, 1
	return lx
}

// peek returns the next unread rune without consuming it. The boolean is
// false (and the rune is 0) when the input is exhausted.
func (l *Lexer) peek() (rune, bool) {
	if l.pos < len(l.src) {
		return l.src[l.pos], true
	}
	return 0, false
}

// peekAt returns the rune offset positions away from the current one
// without consuming anything; peekAt(0) is equivalent to peek. The boolean
// is false (and the rune is 0) when the requested position lies outside the
// input on either side.
func (l *Lexer) peekAt(offset int) (rune, bool) {
	i := l.pos + offset
	// Guard both ends: a negative offset must not index before the start.
	if i < 0 || i >= len(l.src) {
		return 0, false
	}
	return l.src[i], true
}

// advance consumes and returns the current rune, updating the line/column
// counters: a newline starts a new line at column 1, any other rune moves
// one column to the right. The caller must ensure input remains (the slice
// access is unchecked).
func (l *Lexer) advance() rune {
	r := l.src[l.pos]
	l.pos++
	switch r {
	case '\n':
		l.line++
		l.col = 1
	default:
		l.col++
	}
	return r
}

// skipWhitespaceAndComments consumes any run of whitespace and "//" line
// comments, stopping at the first rune that belongs to neither (or at end
// of input). A comment extends up to, but not including, the next newline;
// the newline itself is then consumed as ordinary whitespace.
func (l *Lexer) skipWhitespaceAndComments() {
	for l.pos < len(l.src) {
		cur := l.src[l.pos]

		if cur == '/' {
			if next, ok := l.peekAt(1); ok && next == '/' {
				for l.pos < len(l.src) && l.src[l.pos] != '\n' {
					l.advance()
				}
				continue
			}
		}

		// A lone '/' is not whitespace, so it falls out here and is left
		// for the caller to deal with.
		if !unicode.IsSpace(cur) {
			return
		}
		l.advance()
	}
}

// Tokenize scans the remaining input and returns all tokens, always ending
// with a TOKEN_EOF token. On the first lexical error it returns a nil slice
// and that error. Each token records the line and column where it starts.
func (l *Lexer) Tokenize() ([]Token, error) {
	// Single-character tokens and the type each one maps to.
	punct := map[rune]TokenType{
		'{': TOKEN_LBRACE,
		'}': TOKEN_RBRACE,
		'(': TOKEN_LPAREN,
		')': TOKEN_RPAREN,
		'=': TOKEN_ASSIGN,
		'~': TOKEN_TILDE,
	}

	var tokens []Token
	for {
		l.skipWhitespaceAndComments()
		ch, ok := l.peek()
		if !ok {
			tokens = append(tokens, Token{Type: TOKEN_EOF, Line: l.line, Col: l.col})
			return tokens, nil
		}

		// Remember where the token starts before any reader moves the cursor.
		startLine, startCol := l.line, l.col

		if kind, found := punct[ch]; found {
			l.advance()
			tokens = append(tokens, Token{kind, string(ch), startLine, startCol})
			continue
		}

		switch {
		case ch == '"':
			// Three quotes in a row open a heredoc; otherwise it is a
			// regular quoted string.
			var (
				text string
				err  error
			)
			if l.isHeredocStart() {
				text, err = l.readHeredoc()
			} else {
				text, err = l.readString()
			}
			if err != nil {
				return nil, err
			}
			tokens = append(tokens, Token{TOKEN_STRING, text, startLine, startCol})

		case unicode.IsDigit(ch) || (ch == '-' && l.isNumberNext()):
			// A '-' only starts a number when a digit follows immediately.
			numTok, err := l.readNumberOrDuration(startLine, startCol)
			if err != nil {
				return nil, err
			}
			tokens = append(tokens, numTok)

		case unicode.IsLetter(ch) || ch == '_':
			name := l.readIdent()
			tokens = append(tokens, Token{TOKEN_IDENT, name, startLine, startCol})

		default:
			return nil, fmt.Errorf("%d:%d: unexpected character %q", startLine, startCol, ch)
		}
	}
}

// isHeredocStart reports whether the next three runes are all double
// quotes, i.e. the cursor sits on a `"""` heredoc delimiter.
func (l *Lexer) isHeredocStart() bool {
	for off := 0; off < 3; off++ {
		if r, ok := l.peekAt(off); !ok || r != '"' {
			return false
		}
	}
	return true
}

// isNumberNext reports whether the rune immediately after the current one
// exists and is a digit. Tokenize uses it to decide whether a '-' begins a
// negative number.
func (l *Lexer) isNumberNext() bool {
	if r, ok := l.peekAt(1); ok {
		return unicode.IsDigit(r)
	}
	return false
}

// readHeredoc reads a `"""`-delimited heredoc. The cursor must be on the
// opening delimiter. Everything up to the next `"""` is collected verbatim
// (no escape processing) and passed through dedentHeredoc. It returns an
// error naming the line of the opening delimiter if the input ends before a
// closing delimiter is found.
func (l *Lexer) readHeredoc() (string, error) {
	// Captured up front: by the time EOF is detected l.line points at the
	// end of the file, which is useless for locating the problem.
	startLine := l.line

	// Consume the opening delimiter.
	l.advance()
	l.advance()
	l.advance()

	var buf strings.Builder
	for {
		if l.pos+2 < len(l.src) &&
			l.src[l.pos] == '"' &&
			l.src[l.pos+1] == '"' &&
			l.src[l.pos+2] == '"' {
			// Consume the closing delimiter.
			l.advance()
			l.advance()
			l.advance()
			return dedentHeredoc(buf.String()), nil
		}
		if l.pos >= len(l.src) {
			return "", fmt.Errorf("unterminated heredoc starting at line %d", startLine)
		}
		buf.WriteRune(l.advance())
	}
}

// dedentHeredoc normalizes the raw body of a heredoc:
//
//   - a whitespace-only first line and a whitespace-only last line are
//     dropped (these are typically the remainders of the delimiter lines);
//   - the smallest leading run of spaces/tabs found on any non-blank line
//     is removed from every line that actually starts with that many
//     spaces/tabs.
//
// Lines that do not start with a full indent of spaces/tabs (short blank
// lines, or blank lines made of other whitespace such as "\r" or U+00A0)
// are emitted unchanged, so no line is ever cut in the middle of a
// multi-byte rune. Indentation is measured in bytes; a tab counts as one.
func dedentHeredoc(s string) string {
	lines := strings.Split(s, "\n")

	if len(lines) > 0 && strings.TrimSpace(lines[0]) == "" {
		lines = lines[1:]
	}
	if len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" {
		lines = lines[:len(lines)-1]
	}

	// Smallest indentation among lines that have visible content.
	minIndent := -1
	for _, line := range lines {
		if strings.TrimSpace(line) == "" {
			continue
		}
		indent := len(line) - len(strings.TrimLeft(line, " \t"))
		if minIndent < 0 || indent < minIndent {
			minIndent = indent
		}
	}
	if minIndent <= 0 {
		// Nothing to strip.
		return strings.Join(lines, "\n")
	}

	// startsWithIndent reports whether the first minIndent bytes of line
	// are all spaces or tabs. Always true for non-blank lines by
	// construction; blank lines need the explicit check.
	startsWithIndent := func(line string) bool {
		if len(line) < minIndent {
			return false
		}
		for i := 0; i < minIndent; i++ {
			if line[i] != ' ' && line[i] != '\t' {
				return false
			}
		}
		return true
	}

	var result strings.Builder
	for i, line := range lines {
		if i > 0 {
			result.WriteByte('\n')
		}
		if startsWithIndent(line) {
			line = line[minIndent:]
		}
		result.WriteString(line)
	}
	return result.String()
}

// readString reads a double-quoted string literal. The cursor must be on
// the opening quote. It returns the decoded contents with the escapes \n,
// \t, \\ and \" translated; any other escape is an error. Raw newlines are
// accepted inside the literal. Errors name the line where the string
// opened (for an unterminated string) or where the offending backslash
// sits (for escape errors).
func (l *Lexer) readString() (string, error) {
	// Captured up front: at EOF l.line is the last line of the input, not
	// the line where the unclosed string began.
	startLine := l.line
	l.advance() // opening quote

	var buf strings.Builder
	for {
		ch, ok := l.peek()
		if !ok {
			return "", fmt.Errorf("unterminated string at line %d", startLine)
		}

		switch ch {
		case '"':
			l.advance()
			return buf.String(), nil

		case '\\':
			escLine := l.line
			l.advance()
			esc, ok := l.peek()
			if !ok {
				return "", fmt.Errorf("unterminated escape at line %d", escLine)
			}
			l.advance()
			switch esc {
			case 'n':
				buf.WriteByte('\n')
			case 't':
				buf.WriteByte('\t')
			case '\\':
				buf.WriteByte('\\')
			case '"':
				buf.WriteByte('"')
			default:
				return "", fmt.Errorf("unknown escape \\%c at line %d", esc, escLine)
			}

		default:
			buf.WriteRune(l.advance())
		}
	}
}

// readIdent consumes the longest run of letters, digits and underscores
// starting at the cursor and returns it. It does not check that the first
// rune is a valid identifier start; the caller decides that.
func (l *Lexer) readIdent() string {
	start := l.pos
	for l.pos < len(l.src) {
		r := l.src[l.pos]
		if r != '_' && !unicode.IsLetter(r) && !unicode.IsDigit(r) {
			break
		}
		l.advance()
	}
	return string(l.src[start:l.pos])
}

// readNumberOrDuration reads a numeric literal starting at the cursor: an
// optional leading '-', ASCII digits, and at most one '.'. If a duration
// suffix follows (see tryReadDurationSuffix) the result is a
// TOKEN_DURATION whose value is the number text plus the suffix; otherwise
// it is TOKEN_FLOAT when a '.' was seen and TOKEN_INT when not.
//
// line and col are the start position to stamp on the returned token.
//
// A non-ASCII decimal digit (which unicode.IsDigit accepts, and which the
// caller may therefore dispatch here) is rejected with a positioned error
// instead of being folded into a token value.
func (l *Lexer) readNumberOrDuration(line, col int) (Token, error) {
	var buf strings.Builder
	isFloat := false

	if ch, _ := l.peek(); ch == '-' {
		buf.WriteRune(l.advance())
	}

digits:
	for {
		ch, ok := l.peek()
		if !ok {
			break
		}
		switch {
		case ch >= '0' && ch <= '9':
			buf.WriteRune(l.advance())
		case unicode.IsDigit(ch):
			return Token{}, fmt.Errorf("%d:%d: non-ASCII digit %q in number", l.line, l.col, ch)
		case ch == '.' && !isFloat:
			// Only the first '.' belongs to the number.
			isFloat = true
			buf.WriteRune(l.advance())
		default:
			break digits
		}
	}

	text := buf.String()
	if suffix := l.tryReadDurationSuffix(); suffix != "" {
		return Token{TOKEN_DURATION, text + suffix, line, col}, nil
	}
	if isFloat {
		return Token{TOKEN_FLOAT, text, line, col}, nil
	}
	return Token{TOKEN_INT, text, line, col}, nil
}

// tryReadDurationSuffix consumes a duration unit at the cursor and returns
// it: "ms", "m" or "s" ("ms" is preferred over "m" when both match). It
// returns "" and consumes nothing when no unit is present, or when the
// unit is immediately followed by a letter or underscore: in that case the
// letters are the start of a longer word (e.g. "5minutes"), not a unit. A
// following digit does not block the match.
func (l *Lexer) tryReadDurationSuffix() string {
	var suffix string
	switch ch, _ := l.peek(); ch {
	case 'm':
		suffix = "m"
		if next, ok := l.peekAt(1); ok && next == 's' {
			suffix = "ms"
		}
	case 's':
		suffix = "s"
	default:
		return ""
	}

	// Word-boundary check: all suffixes are ASCII, so len(suffix) is also
	// the number of runes to look past.
	if after, ok := l.peekAt(len(suffix)); ok && (unicode.IsLetter(after) || after == '_') {
		return ""
	}

	for range suffix {
		l.advance()
	}
	return suffix
}