
代码如下
其他部分还没有完成，这里只考虑数值解析这一段，主要从可读性和性能两个方面来看。
package compile.craft; import lombok.extern.slf4j.Slf4j; import static compile.craft.CharUtils.*; @Slf4j public class Lexer { boolean fetchedEOF = false; private final String source; private int pos = 0; private char ch; private int line = 0; private int col = 0; public Lexer(String source) { this.source = source; } /** * 1. line 维护 * */ public Token nextToken() { if (fetchedEOF) { return null; } while (true) { do { advance(); } while (isBlank(ch)); if (ch == EOF) { fetchedEOF = true; return null; } // 处理 // 注释 if (isIdentifierStart(ch)) { return scanIdentifier(); } if (ch == '.') { return scanNumber(); } if (CharUtils.isDigit(ch)) { return scanNumber(); } break; } return null; } private Token scanNumber() { int start = pos - 1; TokenKind tokenKind = null; if (ch == '0') { advance(); // HEX_LITERAL: '0' [xX] [0-9a-fA-F] ([0-9a-fA-F_]* [0-9a-fA-F])? [lL]?; // HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?; // HexDigits: HexDigit ((HexDigit | '_')* HexDigit)?; // Digits: [0-9] ([0-9_]* [0-9])?; if (ch == 'x' || ch == 'X') { if (peek() == '.') { advance(); scanHexFraction(false); String lexeme = source.substring(start, pos); return new Token(TokenKind.HEX_FLOAT_LITERAL, lexeme); } else { scanHex(); if (ch == '.') { scanHexFraction(true); tokenKind = TokenKind.HEX_FLOAT_LITERAL; } else if (ch == 'p' || ch == 'P') { scanExp(); tokenKind = TokenKind.HEX_FLOAT_LITERAL; } else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.HEX_FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.HEX_LITERAL; } else { retreat(); tokenKind = TokenKind.HEX_LITERAL; } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } } else if (ch == 'b' || ch == 'B') { // BINARY_LITERAL: '0' [bB] [01] ([01_]* [01])? 
[lL]?; scanBit(); if ((ch != 'l') && (ch != 'L')) { retreat(); } String lexeme = source.substring(start, pos); return new Token(TokenKind.BINARY_LITERAL, lexeme); } else if (ch == '_' || isOct(ch)) { // OCT_LITERAL: '0' '_'* [0-7] ([0-7_]* [0-7])? [lL]?; // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]); scanOct(); if (isDigit(ch)) { scanDigit(); if (ch == '.') { scanFraction(); } else if (ch == 'e' || ch == 'E') { scanExp(); } else if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { error("invalid oct literal"); } tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'd' || ch == 'D' || ch == 'f' || ch == 'F') { tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.OCT_LITERAL; } else { retreat(); tokenKind = TokenKind.OCT_LITERAL; } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } else { // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]); // DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?; // ExponentPart: [eE] [+-]? 
Digits; if (isDigit(ch)) { do { advance(); } while (isDigit(ch)); if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if ((ch == 'f') || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.FLOAT_LITERAL; } else { error("invalid float literal"); } } else if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.DECIMAL_LITERAL; } else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.FLOAT_LITERAL; } else { retreat(); tokenKind = TokenKind.DECIMAL_LITERAL; } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } } else { // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]); // DECIMAL_LITERL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?; // ExponentPart: [eE] [+-]? 
Digits; if (ch == '.') { scanDigit(); if (ch == 'e' || ch == 'E') { scanExp(); } tokenKind = TokenKind.FLOAT_LITERAL; } else { scanDigit(true); if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.DECIMAL_LITERAL; } else { retreat(); tokenKind = TokenKind.DECIMAL_LITERAL; } } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } } private Token scanIdentifier() { int start = pos - 1; do { advance(); } while (isIdentifierChar(ch)); String lexeme = source.substring(start, pos); retreat(); if (Token.isKeyword(lexeme)) { TokenKind kind = Token.kind(lexeme); return new Token(kind, kind.literal); } else { return new Token(TokenKind.IDENTIFIER, lexeme); } } private void scanHex() { advance(); if (!isHex(ch)) { error("invalid hexadecimal literal"); } do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isHex(ch)) { error("invalid hexadecimal literal"); } } do { advance(); } while (isHex(ch)); } while (ch == '_'); } private void scanBit() { advance(); if (!isBit(ch)) { error("invalid binary literal"); } do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isBit(ch)) { error("invalid binary literal"); } } do { advance(); } while (isBit(ch)); } while (ch == '_'); } private void scanOct() { do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isDigit(ch)) { error("invalid octal literal"); } if (!isOct(ch)) { return; } } do { advance(); } while (isOct(ch)); } while (ch == '_'); } private void scanDigit(boolean hasDigit) { advance(); if (hasDigit) { if ((ch != '_') && !isDigit(ch)) { return; } } else if (!isDigit(ch)) { error("invalid decimal literal"); } do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isDigit(ch)) { error("invalid decimal 
literal"); } } do { advance(); } while (isDigit(ch)); } while (ch == '_'); } private void scanDigit() { scanDigit(false); } // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; private void scanFraction() { if (isDigit(peek())) { scanDigit(); } else { advance(); } if (ch == 'e' || ch == 'E') { char c = peek(); if (c == '+' || c == '-') { advance(); } scanDigit(); } if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { retreat(); } } // HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?; private void scanHexFraction(boolean hasDigit) { if (hasDigit) { if (isHex(peek())) { scanHex(); } else { advance(); } } else { scanHex(); } if ((ch != 'p') && (ch != 'P')) { error("invalid hexadecimal literal"); } char c = peek(); if (c == '+' || c == '-') { advance(); } scanDigit(); if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { retreat(); } } private void scanExp() { char c = peek(); if (c == '+' || c == '-') { advance(); } scanDigit(); if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { retreat(); } } private void advance() { if (pos >= source.length()) { ch = CharUtils.EOF; return; } ch = source.charAt(pos++); } private void retreat() { if (ch != EOF) { if (--pos < 0) { error("tokenizer exceed beginning of source"); } } } private char peek() { if (pos >= source.length()) { return EOF; } return source.charAt(pos); } private void error(String msg) { log.error("lexer error: {}", msg); throw new RuntimeException(msg); } } 1 angryPHP Mar 30, 2023 有没有测试用例啊 |
3 kwh Mar 30, 2023 chatGPT:让我试试 |
4 neptuno Mar 30, 2023 via iPhone 好长 |
5 NeoZephyr OP @angryPHP 有的 ```java
// Spock-style data-driven feature: for every row of the table below, a Lexer is built over
// `source`, the first token is fetched, and its kind name and lexeme are compared with the
// expected `kind` and `lexeme` columns. Every input is a single literal that ends at end of
// input, so behaviour when other characters follow a literal is not exercised here.
def "scan number without exception"() {
    when:
    Lexer lexer = new Lexer(source)
    Token token = lexer.nextToken()

    then:
    kind == token.kind.name()
    lexeme == token.lexeme

    where:
    source | kind | lexeme
    // BINARY_LITERAL: '0' [bB] [01] ([01_]* [01])? [lL]?;
    "0b0" | "BINARY_LITERAL" | "0b0"
    "0b1" | "BINARY_LITERAL" | "0b1"
    "0b000" | "BINARY_LITERAL" | "0b000"
    "0b010" | "BINARY_LITERAL" | "0b010"
    "0B0L" | "BINARY_LITERAL" | "0B0L"
    "0b10___10l" | "BINARY_LITERAL" | "0b10___10l"
    "0b000___0L" | "BINARY_LITERAL" | "0b000___0L"
    "0b001___0L" | "BINARY_LITERAL" | "0b001___0L"
    // OCT_LITERAL: '0' '_'* [0-7] ([0-7_]* [0-7])? [lL]?;
    "0000" | "OCT_LITERAL" | "0000"
    "0__007_2" | "OCT_LITERAL" | "0__007_2"
    "0__7_0L" | "OCT_LITERAL" | "0__7_0L"
    "07_0L" | "OCT_LITERAL" | "07_0L"
    "0000l" | "OCT_LITERAL" | "0000l"
    "0007l" | "OCT_LITERAL" | "0007l"
    "0_00_77__0L" | "OCT_LITERAL" | "0_00_77__0L"
    // HEX_LITERAL: '0' [xX] [0-9a-fA-F] ([0-9a-fA-F_]* [0-9a-fA-F])? [lL]?;
    "0x0" | "HEX_LITERAL" | "0x0"
    "0x000l" | "HEX_LITERAL" | "0x000l"
    "0x0eeeL" | "HEX_LITERAL" | "0x0eeeL"
    "0x000eee" | "HEX_LITERAL" | "0x000eee"
    "0x1_E2E3e5" | "HEX_LITERAL" | "0x1_E2E3e5"
    "0xe___ee_eL" | "HEX_LITERAL" | "0xe___ee_eL"
    // DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?;
    "0L" | "DECIMAL_LITERAL" | "0L"
    "0" | "DECIMAL_LITERAL" | "0"
    "9l" | "DECIMAL_LITERAL" | "9l"
    "99___9L" | "DECIMAL_LITERAL" | "99___9L"
    "9___99___9L" | "DECIMAL_LITERAL" | "9___99___9L"
    "9999" | "DECIMAL_LITERAL" | "9999"
    "1_000_000" | "DECIMAL_LITERAL" | "1_000_000"
    // HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?;
    "0x.fp012" | "HEX_FLOAT_LITERAL" | "0x.fp012"
    "0x.f__f_fp-0012d" | "HEX_FLOAT_LITERAL" | "0x.f__f_fp-0012d"
    "0xf_ffP12d" | "HEX_FLOAT_LITERAL" | "0xf_ffP12d"
    "0xf_f.P-12d" | "HEX_FLOAT_LITERAL" | "0xf_f.P-12d"
    "0xf_f.f_fP+12" | "HEX_FLOAT_LITERAL" | "0xf_f.f_fP+12"
    "0x0.0000000fp-11" | "HEX_FLOAT_LITERAL" | "0x0.0000000fp-11"
    "0xf_ff.P12d" | "HEX_FLOAT_LITERAL" | "0xf_ff.P12d"
    "0X0P0f" | "HEX_FLOAT_LITERAL" | "0X0P0f"
    "0X0P0" | "HEX_FLOAT_LITERAL" | "0X0P0"
    "0X0_0__123P0f" | "HEX_FLOAT_LITERAL" | "0X0_0__123P0f"
    "0XeP0f" | "HEX_FLOAT_LITERAL" | "0XeP0f"
    "0X000.P0f" | "HEX_FLOAT_LITERAL" | "0X000.P0f"
    "0X00e.P0f" | "HEX_FLOAT_LITERAL" | "0X00e.P0f"
    "0X0e__0.0P0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0P0f"
    "0X0e__0.0__0P0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__0P0f"
    "0X0e__0.0__e0P-0__0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__e0P-0__0f"
    "0X0e__0.0__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__e0P+0_1__0f"
    "0X0.0__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0.0__e0P+0_1__0f"
    "0X0.00P0f" | "HEX_FLOAT_LITERAL" | "0X0.00P0f"
    "0X0.0eP0f" | "HEX_FLOAT_LITERAL" | "0X0.0eP0f"
    "0X0.e__00P0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00P0f"
    "0X0.e__00__0P0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00__0P0f"
    "0X0.e__00__e0P-0__0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00__e0P-0__0f"
    "0X0e.0__00__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0e.0__00__e0P+0_1__0f"
    "0X.0__00__e0P-0_1__0F" | "HEX_FLOAT_LITERAL" | "0X.0__00__e0P-0_1__0F"
    "0X.0__00__e0P-0_1__0" | "HEX_FLOAT_LITERAL" | "0X.0__00__e0P-0_1__0"
    // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
    // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
    // ExponentPart: [eE] [+-]? Digits
    // Leading zeros (with or without underscores) followed by a suffix or exponent.
    "0f" | "FLOAT_LITERAL" | "0f"
    "00f" | "FLOAT_LITERAL" | "00f"
    "0__0_0f" | "FLOAT_LITERAL" | "0__0_0f"
    "0001f" | "FLOAT_LITERAL" | "0001f"
    "0e0f" | "FLOAT_LITERAL" | "0e0f"
    "0e0" | "FLOAT_LITERAL" | "0e0"
    "1e1" | "FLOAT_LITERAL" | "1e1"
    "0_0e-0_0f" | "FLOAT_LITERAL" | "0_0e-0_0f"
    "0_0e0_120f" | "FLOAT_LITERAL" | "0_0e0_120f"
    "0001e0009" | "FLOAT_LITERAL" | "0001e0009"
    "000012345e1" | "FLOAT_LITERAL" | "000012345e1"
    "0_00___9900d" | "FLOAT_LITERAL" | "0_00___9900d"
    // Leading dot, no suffix unless part of an exponent form.
    ".0" | "FLOAT_LITERAL" | ".0"
    ".0e0" | "FLOAT_LITERAL" | ".0e0"
    ".0_000" | "FLOAT_LITERAL" | ".0_000"
    ".0___0990" | "FLOAT_LITERAL" | ".0___0990"
    ".000e0__0__0" | "FLOAT_LITERAL" | ".000e0__0__0"
    ".000e0__0__0f" | "FLOAT_LITERAL" | ".000e0__0__0f"
    ".000e-0__9__0f" | "FLOAT_LITERAL" | ".000e-0__9__0f"
    ".9e-0__1_0f" | "FLOAT_LITERAL" | ".9e-0__1_0f"
    // Digits followed by a dot, with and without fraction digits.
    "0__00." | "FLOAT_LITERAL" | "0__00."
    "0__090." | "FLOAT_LITERAL" | "0__090."
    "99__9." | "FLOAT_LITERAL" | "99__9."
    "000.000" | "FLOAT_LITERAL" | "000.000"
    "0__10.090" | "FLOAT_LITERAL" | "0__10.090"
    "000__1_0.090" | "FLOAT_LITERAL" | "000__1_0.090"
    "000__1_0.090e12" | "FLOAT_LITERAL" | "000__1_0.090e12"
    "0__10.090e1__00" | "FLOAT_LITERAL" | "0__10.090e1__00"
    "3.1415926" | "FLOAT_LITERAL" | "3.1415926"
    "0.030" | "FLOAT_LITERAL" | "0.030"
    "0.6" | "FLOAT_LITERAL" | "0.6"
    // The same shapes again with an explicit 'f' suffix.
    ".0f" | "FLOAT_LITERAL" | ".0f"
    ".0_000f" | "FLOAT_LITERAL" | ".0_000f"
    ".0___0990f" | "FLOAT_LITERAL" | ".0___0990f"
    // NOTE(review): this row duplicates an earlier one in the leading-dot group.
    ".000e0__0__0f" | "FLOAT_LITERAL" | ".000e0__0__0f"
    "0__00.f" | "FLOAT_LITERAL" | "0__00.f"
    "0__090.f" | "FLOAT_LITERAL" | "0__090.f"
    "99__9.f" | "FLOAT_LITERAL" | "99__9.f"
    "000.000f" | "FLOAT_LITERAL" | "000.000f"
    "0__10.090f" | "FLOAT_LITERAL" | "0__10.090f"
    "000__1_0.090f" | "FLOAT_LITERAL" | "000__1_0.090f"
    "000__1_0.090e12f" | "FLOAT_LITERAL" | "000__1_0.090e12f"
    "0__10.090e1__00f" | "FLOAT_LITERAL" | "0__10.090e1__00f"
    "3.1415926f" | "FLOAT_LITERAL" | "3.1415926f"
    "0.030f" | "FLOAT_LITERAL" | "0.030f"
    "0.6f" | "FLOAT_LITERAL" | "0.6f"
}
``` |
8 lucaslee Mar 30, 2023 才看到注释,写那么多代码是在翻译正则? |
9 Leviathann Mar 30, 2023 见不得这种副作用满天飞的代码 |
10 popvlovs Mar 31, 2023 看的我眼花 o_o .... 我只用过 antlr ,从来没考虑过自己手搓 lexer ,这是要拿来练手么? |