blockflow-workbench/src/core/expression/tokenizer.ts

import type { Token } from "./types"

/**
 * Error thrown when an expression cannot be tokenized.
 *
 * The final `message` is the supplied text followed by ` at <position>`, and the
 * raw offset is also kept on the instance so callers can read it directly.
 */
export class ExpressionSyntaxError extends Error {
  /** Offset supplied by the thrower; within this file it is an index into the input string. */
  readonly position: number

  /**
   * @param message - Description of the problem, without the location suffix.
   * @param position - Offset to report; appended to the message and stored on the error.
   */
  constructor(message: string, position: number) {
    super(`${message} at ${position}`)
    this.position = position
    this.name = "ExpressionSyntaxError"
  }
}

// Operators spelled with exactly two characters.
const twoCharOperators = new Set<string>([
  "==",
  "!=",
  ">=",
  "<=",
  "&&",
  "||",
  "??"
])

// Operators spelled with a single character.
const oneCharOperators = new Set<string>([">", "<", "!"])

// Structural characters emitted as "punctuation" tokens rather than operators.
const punctuation = new Set<string>([".", "(", ")", "[", "]"])

/**
 * Splits an expression string into a flat list of tokens.
 *
 * At each position the following are tried in order: whitespace (skipped),
 * double-quoted string literals, numbers (a run of digits optionally followed by
 * `.` and more digits), identifiers (the word `contains` is emitted as an
 * operator), two-character operators, one-character operators, and punctuation.
 * An `eof` token positioned at `input.length` is always appended last.
 *
 * @param input - Raw expression text.
 * @returns Tokens in source order, terminated by the `eof` token.
 * @throws ExpressionSyntaxError when a character starts no known token, or when
 *   `readString` rejects a malformed string literal.
 */
export function tokenizeExpression(input: string): Token[] {
  // Hoisted so the patterns are not re-created on every loop iteration.
  const whitespace = /\s/
  const digit = /[0-9]/

  const tokens: Token[] = []
  let position = 0

  while (position < input.length) {
    const char = input[position]

    if (char === undefined) {
      break
    }

    if (whitespace.test(char)) {
      position += 1
      continue
    }

    if (char === "\"") {
      tokens.push(readString(input, position))
      // Resume after the closing quote as it appears in the raw input. The token's
      // value is already unescaped, so its length is shorter than the source text
      // whenever the literal contains an escape sequence; deriving the next
      // position from it would restart scanning inside the literal.
      position = stringLiteralEnd(input, position)
      continue
    }

    if (digit.test(char)) {
      const start = position
      position += 1
      while (position < input.length && digit.test(input[position] ?? "")) {
        position += 1
      }
      if (input[position] === ".") {
        position += 1
        while (position < input.length && digit.test(input[position] ?? "")) {
          position += 1
        }
      }
      tokens.push({ type: "number", value: input.slice(start, position), position: start })
      continue
    }

    if (isIdentifierStart(char)) {
      const start = position
      position += 1
      while (position < input.length && isIdentifierPart(input[position] ?? "")) {
        position += 1
      }
      const value = input.slice(start, position)
      tokens.push({
        // `contains` is the only word-shaped operator; every other word is an identifier.
        type: value === "contains" ? "operator" : "identifier",
        value,
        position: start
      })
      continue
    }

    // Two-character operators are tried first so that e.g. ">=" is not split into ">" and "=".
    const twoChars = input.slice(position, position + 2)
    if (twoCharOperators.has(twoChars)) {
      tokens.push({ type: "operator", value: twoChars, position })
      position += 2
      continue
    }

    if (oneCharOperators.has(char)) {
      tokens.push({ type: "operator", value: char, position })
      position += 1
      continue
    }

    if (punctuation.has(char)) {
      tokens.push({ type: "punctuation", value: char, position })
      position += 1
      continue
    }

    throw new ExpressionSyntaxError(`Unexpected character "${char}"`, position)
  }

  tokens.push({ type: "eof", value: "", position: input.length })
  return tokens
}

/**
 * Returns the index just past the closing quote of the string literal whose
 * opening quote is at `start`.
 *
 * Uses the same rule as `readString`: a backslash always consumes the character
 * after it, so an escaped quote does not end the literal. Intended to be called
 * only after `readString` has accepted the literal, which guarantees a closing
 * quote exists; the length bound is purely defensive.
 */
function stringLiteralEnd(input: string, start: number): number {
  let cursor = start + 1
  while (cursor < input.length && input[cursor] !== "\"") {
    cursor += input[cursor] === "\\" ? 2 : 1
  }
  return cursor + 1
}

/**
 * Reads the double-quoted string literal whose opening quote sits at `start`.
 *
 * Backslash escapes are decoded through `decodeEscape`, so the returned token's
 * `value` holds the unescaped text without the surrounding quotes.
 *
 * @param input - Full expression text.
 * @param start - Index of the opening quote; also used as the token's position.
 * @returns A `string` token positioned at `start`.
 * @throws ExpressionSyntaxError if the input ends right after a backslash, or if
 *   no closing quote is found.
 */
function readString(input: string, start: number): Token {
  let decoded = ""

  for (let cursor = start + 1; cursor < input.length; cursor += 1) {
    const current = input[cursor]

    switch (current) {
      case "\"":
        return { type: "string", value: decoded, position: start }

      case "\\": {
        const next = input[cursor + 1]
        if (next === undefined) {
          throw new ExpressionSyntaxError("Unterminated escape sequence", cursor)
        }
        decoded += decodeEscape(next)
        // Step over the escaped character; the loop increment steps over the backslash.
        cursor += 1
        break
      }

      default:
        decoded += current
    }
  }

  throw new ExpressionSyntaxError("Unterminated string literal", start)
}

/**
 * Maps the character following a backslash to the character it stands for.
 *
 * `n`, `r` and `t` become newline, carriage return and tab. Every other input,
 * including the quote and the backslash themselves, is returned unchanged.
 *
 * @param char - The character that came after the backslash.
 * @returns The decoded character.
 */
function decodeEscape(char: string): string {
  switch (char) {
    case "n":
      return "\n"
    case "r":
      return "\r"
    case "t":
      return "\t"
    default:
      // Covers `"` and `\` as well as any unrecognized escape: all map to themselves.
      return char
  }
}

/**
 * Reports whether `char` may begin an identifier: an ASCII letter, `_` or `$`.
 *
 * The pattern is unanchored, so a longer string passes if any of its characters qualifies.
 */
function isIdentifierStart(char: string): boolean {
  const firstMatch = char.search(/[A-Za-z_$]/)
  return firstMatch !== -1
}

/**
 * Reports whether `char` may continue an identifier: an ASCII letter or digit,
 * `_`, `$` or `-`.
 *
 * The pattern is unanchored, so a longer string passes if any of its characters qualifies.
 */
function isIdentifierPart(char: string): boolean {
  const found = /[A-Za-z0-9_$-]/.exec(char)
  return found !== null
}