Implement Lexer (with temporary error handling and basic tests)

2018-11-22 19:54:59 +01:00 · 2018-11-22 19:54:59 +01:00 · da91a5df33
commit da91a5df33
parent 61d40330be
2 changed files with 220 additions and 0 deletions
--- a/core/src/main/java/it/cavallium/warppi/math/rules/dsl/frontend/Lexer.java
+++ b/core/src/main/java/it/cavallium/warppi/math/rules/dsl/frontend/Lexer.java
@ -0,0 +1,138 @@
+package it.cavallium.warppi.math.rules.dsl.frontend;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static it.cavallium.warppi.math.rules.dsl.frontend.TokenType.*;
+
+/**
+ * Converts the source string to a list of tokens.
+ * <p>
+ * The source is scanned from left to right. Whitespace between tokens is skipped,
+ * and every token records the offset in the source at which its lexeme starts.
+ * The resulting list always ends with an <code>EOF</code> token.
+ * <p>
+ * Error handling is temporary: lexical errors are reported by throwing a plain
+ * {@link RuntimeException}.
+ */
+public class Lexer {
+	/**
+	 * Maps the lexeme of each keyword (the lowercase name of its token type) to that token type.
+	 * <p>
+	 * The names are lowercased with {@link Locale#ROOT}: the locale-sensitive overload would,
+	 * for example, turn "I" into a dotless "ı" under a Turkish default locale, so keywords
+	 * such as "reduction", "sin" or "pi" would no longer be recognized.
+	 */
+	private static final Map<String, TokenType> keywords = Stream.of(
+			REDUCTION, EXPANSION, CALCULATION, EXISTENCE,
+			ARCCOS, ARCSIN, ARCTAN, COS, SIN, TAN, ROOT, SQRT, LOG,
+			UNDEFINED, PI, E
+	).collect(Collectors.toMap(
+			tokenType -> tokenType.name().toLowerCase(Locale.ROOT),
+			Function.identity()
+	));
+
+	/** The complete text being lexed. */
+	private final String source;
+	/** The tokens emitted so far, in source order. */
+	private final List<Token> tokens = new ArrayList<>();
+	/** Offset in <code>source</code> of the first character of the lexeme currently being scanned. */
+	private int startOfLexeme = 0;
+	/** Offset in <code>source</code> of the next character which has not been consumed yet. */
+	private int curPosition = 0;
+
+	/**
+	 * Creates a lexer for the given source string.
+	 *
+	 * @param source the text to be split into tokens
+	 */
+	public Lexer(final String source) {
+		this.source = source;
+	}
+
+	/**
+	 * Lexes the whole source string.
+	 * <p>
+	 * Every call scans the source again from the beginning, so calling this method more than
+	 * once does not append a second sequence of tokens (or a duplicate <code>EOF</code>).
+	 *
+	 * @return the tokens found in the source, followed by an <code>EOF</code> token whose
+	 *         position is the length of the source
+	 * @throws RuntimeException if the source contains an unexpected character or a number
+	 *                          with no digits after the decimal separator
+	 */
+	public List<Token> lex() {
+		// Reset the scanner state: without this, a repeated call would start at the end of
+		// the source and only add another EOF token to the previous result.
+		tokens.clear();
+		startOfLexeme = 0;
+		curPosition = 0;
+
+		while (!atEnd()) {
+			startOfLexeme = curPosition;
+			lexToken();
+		}
+		tokens.add(new Token(EOF, "", source.length()));
+		return tokens;
+	}
+
+	/**
+	 * Consumes the characters of a single lexeme and emits the corresponding token.
+	 * Whitespace characters are consumed without emitting anything.
+	 *
+	 * @throws RuntimeException if the next character cannot start any token
+	 */
+	private void lexToken() {
+		char current = popChar();
+		switch (current) {
+			case ':': emitToken(COLON); break;
+			case ',': emitToken(COMMA); break;
+			case '(': emitToken(LEFT_PAREN); break;
+			case ')': emitToken(RIGHT_PAREN); break;
+			case '[': emitToken(LEFT_BRACKET); break;
+			case ']': emitToken(RIGHT_BRACKET); break;
+			case '=': emitToken(EQUALS); break;
+			case '*': emitToken(TIMES); break;
+			case '/': emitToken(DIVIDE); break;
+			case '^': emitToken(POWER); break;
+
+			// "+" is either the start of "+-" or a token on its own
+			case '+':
+				if (matchChar('-')) {
+					emitToken(PLUS_MINUS);
+				} else {
+					emitToken(PLUS);
+				}
+				break;
+
+			// "-" is either the start of "->" or a token on its own
+			case '-':
+				if (matchChar('>')) {
+					emitToken(ARROW);
+				} else {
+					emitToken(MINUS);
+				}
+				break;
+
+			default:
+				if (isAsciiDigit(current)) {
+					number();
+				} else if (Character.isJavaIdentifierStart(current)) {
+					keywordOrIdentifier();
+				} else if (!Character.isWhitespace(current)) {
+					throw new RuntimeException("Unexpected character " + current);
+				}
+				// Otherwise the character is whitespace, which is simply skipped
+		}
+	}
+
+	/**
+	 * Lexes the rest of a number whose first digit has already been consumed:
+	 * any further digits, optionally followed by a decimal separator and at least one digit.
+	 *
+	 * @throws RuntimeException if the decimal separator is not followed by a digit
+	 */
+	private void number() {
+		matchWhile(Lexer::isAsciiDigit);
+		if (matchChar('.') && matchWhile(Lexer::isAsciiDigit) == 0) {
+			throw new RuntimeException("Expected digits after decimal separator");
+		}
+		emitToken(NUMBER);
+	}
+
+	/**
+	 * Lexes the rest of a word whose first character has already been consumed.
+	 * The word becomes a keyword token if it is one of the known keywords,
+	 * and an <code>IDENTIFIER</code> token otherwise.
+	 */
+	private void keywordOrIdentifier() {
+		matchWhile(Character::isJavaIdentifierPart);
+		TokenType type = keywords.getOrDefault(currentLexeme(), IDENTIFIER);
+		emitToken(type);
+	}
+
+	/**
+	 * Consumes the next character. Must not be called when the end of the source has been reached.
+	 *
+	 * @return the consumed character
+	 */
+	private char popChar() {
+		char current = source.charAt(curPosition);
+		curPosition++;
+		return current;
+	}
+
+	/**
+	 * Consumes the next character only if it is equal to the expected one.
+	 *
+	 * @param expected the character to look for
+	 * @return <code>true</code> if the character has been consumed, <code>false</code> if the
+	 *         end of the source has been reached or the next character is a different one
+	 */
+	private boolean matchChar(char expected) {
+		if (atEnd() || source.charAt(curPosition) != expected) {
+			return false;
+		}
+		curPosition++;
+		return true;
+	}
+
+	/**
+	 * Consumes characters for as long as they satisfy the predicate.
+	 *
+	 * @param predicate the condition which each consumed character has to satisfy
+	 * @return the number of consumed characters (possibly zero)
+	 */
+	private int matchWhile(Predicate<Character> predicate) {
+		int matched = 0;
+		while (!atEnd() && predicate.test(source.charAt(curPosition))) {
+			curPosition++;
+			matched++;
+		}
+		return matched;
+	}
+
+	/**
+	 * Appends a token of the given type, whose lexeme consists of all the characters consumed
+	 * since the start of the current lexeme.
+	 *
+	 * @param type the type of the new token
+	 */
+	private void emitToken(TokenType type) {
+		tokens.add(new Token(type, currentLexeme(), startOfLexeme));
+	}
+
+	/**
+	 * @return the text consumed since the start of the current lexeme
+	 */
+	private String currentLexeme() {
+		return source.substring(startOfLexeme, curPosition);
+	}
+
+	/**
+	 * @return <code>true</code> if every character of the source has been consumed
+	 */
+	private boolean atEnd() {
+		return curPosition >= source.length();
+	}
+
+	// Character.isDigit also allows various Unicode digits
+	private static boolean isAsciiDigit(char c) {
+		return '0' <= c && c <= '9';
+	}
+}
--- a/core/src/test/java/it/cavallium/warppi/math/rules/dsl/frontend/LexerTest.java
+++ b/core/src/test/java/it/cavallium/warppi/math/rules/dsl/frontend/LexerTest.java
@ -0,0 +1,82 @@
+package it.cavallium.warppi.math.rules.dsl.frontend;
+
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static it.cavallium.warppi.math.rules.dsl.frontend.TokenType.*;
+import static org.junit.Assert.*;
+
+/**
+ * Basic tests for {@link Lexer}: one complete valid rule, plus inputs which must be rejected.
+ */
+public class LexerTest {
+	/**
+	 * Runs a new lexer over the given source.
+	 *
+	 * @param source the text to lex
+	 * @return the tokens produced by the lexer
+	 */
+	private static List<Token> lex(final String source) {
+		return new Lexer(source).lex();
+	}
+
+	/**
+	 * A rule which uses every kind of token is split into the expected tokens,
+	 * each with the offset at which its lexeme starts, followed by EOF.
+	 */
+	@Test
+	public void validRule() {
+		final String source =
+				"reduction TestRule_123:\n" +
+						"  x + y * z = -(a_123 +- 3 / 2.2) -> [\n" +
+						"    x^a_123 = cos(pi) - log(e, e),\n" +
+						"    undefined,\n" +
+						"]\n";
+
+		final List<Token> expectedTokens = Arrays.asList(
+				// First line: rule header
+				new Token(REDUCTION, "reduction", 0),
+				new Token(IDENTIFIER, "TestRule_123", 10),
+				new Token(COLON, ":", 22),
+
+				// Second line: source pattern
+				new Token(IDENTIFIER, "x", 26),
+				new Token(PLUS, "+", 28),
+				new Token(IDENTIFIER, "y", 30),
+				new Token(TIMES, "*", 32),
+				new Token(IDENTIFIER, "z", 34),
+				new Token(EQUALS, "=", 36),
+				new Token(MINUS, "-", 38),
+				new Token(LEFT_PAREN, "(", 39),
+				new Token(IDENTIFIER, "a_123", 40),
+				new Token(PLUS_MINUS, "+-", 46),
+				new Token(NUMBER, "3", 49),
+				new Token(DIVIDE, "/", 51),
+				new Token(NUMBER, "2.2", 53),
+				new Token(RIGHT_PAREN, ")", 56),
+				new Token(ARROW, "->", 58),
+				new Token(LEFT_BRACKET, "[", 61),
+
+				// Third line: first replacement
+				new Token(IDENTIFIER, "x", 67),
+				new Token(POWER, "^", 68),
+				new Token(IDENTIFIER, "a_123", 69),
+				new Token(EQUALS, "=", 75),
+				new Token(COS, "cos", 77),
+				new Token(LEFT_PAREN, "(", 80),
+				new Token(PI, "pi", 81),
+				new Token(RIGHT_PAREN, ")", 83),
+				new Token(MINUS, "-", 85),
+				new Token(LOG, "log", 87),
+				new Token(LEFT_PAREN, "(", 90),
+				new Token(E, "e", 91),
+				new Token(COMMA, ",", 92),
+				new Token(E, "e", 94),
+				new Token(RIGHT_PAREN, ")", 95),
+				new Token(COMMA, ",", 96),
+
+				// Fourth line: second replacement
+				new Token(UNDEFINED, "undefined", 102),
+				new Token(COMMA, ",", 111),
+
+				// Fifth line: end of the replacement list, then end of input
+				new Token(RIGHT_BRACKET, "]", 113),
+				new Token(EOF, "", 115)
+		);
+
+		final List<Token> actualTokens = lex(source);
+		assertEquals(expectedTokens, actualTokens);
+	}
+
+	/** A decimal separator followed by a non-digit character is rejected. */
+	@Test(expected = RuntimeException.class)
+	public void incompleteNumberOtherChar() {
+		lex("2. 5");
+	}
+
+	/** A decimal separator at the very end of the input is rejected. */
+	@Test(expected = RuntimeException.class)
+	public void incompleteNumberEof() {
+		lex("2.");
+	}
+
+	/** A character which cannot start any token is rejected. */
+	@Test(expected = RuntimeException.class)
+	public void meaninglessCharacter() {
+		lex("@");
+	}
+}