WarpPI/core/src/main/java/it/cavallium/warppi/math/rules/dsl/frontend/Lexer.java

231 lines
6.4 KiB
Java

package it.cavallium.warppi.math.rules.dsl.frontend;
import it.cavallium.warppi.math.rules.dsl.DslError;
import java.util.*;
import java.util.function.Consumer;
import java.util.function.Predicate;
import static it.cavallium.warppi.math.rules.dsl.frontend.TokenType.*;
/**
* Converts the source <code>String</code> to a list of {@link Token}s.
*/
public class Lexer {
private static final Map<String, TokenType> KEYWORDS;
static {
TokenType[] keywordTokenTypes = {
REDUCTION, EXPANSION, CALCULATION, EXISTENCE,
ARCCOS, ARCSIN, ARCTAN, COS, SIN, TAN, ROOT, SQRT, LOG,
UNDEFINED, PI, E
};
Map<String, TokenType> map = new HashMap<>();
for (TokenType type : keywordTokenTypes) {
map.put(type.name().toLowerCase(), type);
}
KEYWORDS = Collections.unmodifiableMap(map);
}
private final String source;
private final Consumer<? super DslError> errorReporter;
private boolean used = false;
private final List<Token> tokens = new ArrayList<>();
private int startOfLexeme = 0;
private int curPosition = 0;
private UnexpectedCharacters unexpectedCharacters = null;
/**
* Constructs a <code>Lexer</code> that will split the given source code into {@link Token}s.
*
* @param source a <code>String</code> containing the DSL source code to process.
* @param errorReporter a <code>Consumer</code> used to report each <code>DslError</code> that the
* <code>Lexer</code> finds within the source string.
*/
public Lexer(final String source, final Consumer<? super DslError> errorReporter) {
this.source = source;
this.errorReporter = errorReporter;
}
/**
* Runs the <code>Lexer</code>.
* <p>
* This method can only be called once per instance.
*
* @return the list of <code>Token</code>s extracted from the source string.
* If any errors are reported, this list should not be considered to represent a valid set of DSL rules,
* but it can still be parsed to potentially find additional errors (which may allow the user to fix more
* errors before having to rerun the <code>Lexer</code>).
* @throws IllegalStateException if called multiple times on the same instance.
*/
public List<Token> lex() {
if (used) {
throw new IllegalStateException("Lexer.lex can only be called once per instance");
}
used = true;
while (!atEnd()) {
startOfLexeme = curPosition;
lexAndHandleErrors();
}
// lexAndHandleErrors reports unexpected characters when they're followed by expected ones:
// if there are unexpected characters at the end of the source, they have to be reported here
reportAndClearUnexpectedCharacters();
tokens.add(new Token(EOF, "", source.length()));
return tokens;
}
private void lexAndHandleErrors() {
try {
lexToken();
reportAndClearUnexpectedCharacters(); // After finding some expected characters
} catch (final SyntaxException e) {
final DslError error = e.getError();
if (error instanceof UnexpectedCharacters) {
addUnexpectedCharacters((UnexpectedCharacters) error);
} else {
// If there are multiple errors, report them in the order in which they occur in the source
reportAndClearUnexpectedCharacters();
errorReporter.accept(error);
}
}
}
private void addUnexpectedCharacters(final UnexpectedCharacters unexpected) {
if (unexpectedCharacters == null) {
unexpectedCharacters = unexpected;
} else {
unexpectedCharacters = unexpectedCharacters.concat(unexpected);
}
}
private void reportAndClearUnexpectedCharacters() {
if (unexpectedCharacters == null) {
return;
}
errorReporter.accept(unexpectedCharacters);
unexpectedCharacters = null;
}
private void lexToken() throws SyntaxException {
char current = popChar();
switch (current) {
case ':': emitToken(COLON); break;
case ',': emitToken(COMMA); break;
case '(': emitToken(LEFT_PAREN); break;
case ')': emitToken(RIGHT_PAREN); break;
case '[': emitToken(LEFT_BRACKET); break;
case ']': emitToken(RIGHT_BRACKET); break;
case '=': emitToken(EQUALS); break;
case '*': emitToken(TIMES); break;
case '^': emitToken(POWER); break;
case '+':
if (matchChar('-')) {
emitToken(PLUS_MINUS);
} else {
emitToken(PLUS);
}
break;
case '-':
if (matchChar('>')) {
emitToken(ARROW);
} else {
emitToken(MINUS);
}
break;
case '/':
if (matchChar('/')) {
singleLineComment();
} else if (matchChar('*')) {
multiLineComment();
} else {
emitToken(DIVIDE);
}
break;
default:
if (isAsciiDigit(current)) {
number();
} else if (Character.isJavaIdentifierStart(current)) {
keywordOrIdentifier();
} else if (!Character.isWhitespace(current)) {
throw new SyntaxException(
new UnexpectedCharacters(curPosition - 1, String.valueOf(current))
);
}
}
}
private void singleLineComment() {
matchWhile(c -> c != '\n');
}
private void multiLineComment() throws SyntaxException {
while (!(matchChar('*') && matchChar('/'))) {
if (atEnd()) {
throw new SyntaxException(new UnterminatedComment(startOfLexeme));
}
popChar();
}
}
private void number() throws SyntaxException {
matchWhile(Lexer::isAsciiDigit);
if (matchChar('.') && matchWhile(Lexer::isAsciiDigit) == 0) {
throw new SyntaxException(
new IncompleteNumberLiteral(startOfLexeme, currentLexeme())
);
}
emitToken(NUMBER);
}
private void keywordOrIdentifier() {
matchWhile(Character::isJavaIdentifierPart);
TokenType type = KEYWORDS.getOrDefault(currentLexeme(), IDENTIFIER);
emitToken(type);
}
private char popChar() {
char current = source.charAt(curPosition);
curPosition++;
return current;
}
private boolean matchChar(char expected) {
if (atEnd() || source.charAt(curPosition) != expected) {
return false;
}
curPosition++;
return true;
}
private int matchWhile(Predicate<Character> predicate) {
int matched = 0;
while (!atEnd() && predicate.test(source.charAt(curPosition))) {
curPosition++;
matched++;
}
return matched;
}
private void emitToken(TokenType type) {
tokens.add(new Token(type, currentLexeme(), startOfLexeme));
}
private String currentLexeme() {
return source.substring(startOfLexeme, curPosition);
}
private boolean atEnd() {
return curPosition >= source.length();
}
// Character.isDigit also allows various Unicode digits
private static boolean isAsciiDigit(char c) {
return '0' <= c && c <= '9';
}
}