有没有好心人帮忙 review 一下, Java 数字的词法解析实现 - V2EX
V2EX = way to explore
V2EX 是一个关于分享和探索的地方
Sign Up Now
For Existing Member  Sign In
NeoZephyr
V2EX    Java

有没有好心人帮忙 review 一下, Java 数字的词法解析实现

  •  
  •   NeoZephyr Mar 30, 2023 2449 views
    This topic was created 1126 days ago; the information mentioned may have changed or developed.

    代码如下

    其他的部分还没有完成,只考虑数值解析这一段。主要是从可读性跟性能两个方面看

    package compile.craft; import lombok.extern.slf4j.Slf4j; import static compile.craft.CharUtils.*; @Slf4j public class Lexer { boolean fetchedEOF = false; private final String source; private int pos = 0; private char ch; private int line = 0; private int col = 0; public Lexer(String source) { this.source = source; } /** * 1. line 维护 * */ public Token nextToken() { if (fetchedEOF) { return null; } while (true) { do { advance(); } while (isBlank(ch)); if (ch == EOF) { fetchedEOF = true; return null; } // 处理 // 注释 if (isIdentifierStart(ch)) { return scanIdentifier(); } if (ch == '.') { return scanNumber(); } if (CharUtils.isDigit(ch)) { return scanNumber(); } break; } return null; } private Token scanNumber() { int start = pos - 1; TokenKind tokenKind = null; if (ch == '0') { advance(); // HEX_LITERAL: '0' [xX] [0-9a-fA-F] ([0-9a-fA-F_]* [0-9a-fA-F])? [lL]?; // HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?; // HexDigits: HexDigit ((HexDigit | '_')* HexDigit)?; // Digits: [0-9] ([0-9_]* [0-9])?; if (ch == 'x' || ch == 'X') { if (peek() == '.') { advance(); scanHexFraction(false); String lexeme = source.substring(start, pos); return new Token(TokenKind.HEX_FLOAT_LITERAL, lexeme); } else { scanHex(); if (ch == '.') { scanHexFraction(true); tokenKind = TokenKind.HEX_FLOAT_LITERAL; } else if (ch == 'p' || ch == 'P') { scanExp(); tokenKind = TokenKind.HEX_FLOAT_LITERAL; } else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.HEX_FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.HEX_LITERAL; } else { retreat(); tokenKind = TokenKind.HEX_LITERAL; } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } } else if (ch == 'b' || ch == 'B') { // BINARY_LITERAL: '0' [bB] [01] ([01_]* [01])? 
[lL]?; scanBit(); if ((ch != 'l') && (ch != 'L')) { retreat(); } String lexeme = source.substring(start, pos); return new Token(TokenKind.BINARY_LITERAL, lexeme); } else if (ch == '_' || isOct(ch)) { // OCT_LITERAL: '0' '_'* [0-7] ([0-7_]* [0-7])? [lL]?; // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]); scanOct(); if (isDigit(ch)) { scanDigit(); if (ch == '.') { scanFraction(); } else if (ch == 'e' || ch == 'E') { scanExp(); } else if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { error("invalid oct literal"); } tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'd' || ch == 'D' || ch == 'f' || ch == 'F') { tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.OCT_LITERAL; } else { retreat(); tokenKind = TokenKind.OCT_LITERAL; } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } else { // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]); // DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?; // ExponentPart: [eE] [+-]? 
Digits; if (isDigit(ch)) { do { advance(); } while (isDigit(ch)); if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if ((ch == 'f') || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.FLOAT_LITERAL; } else { error("invalid float literal"); } } else if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.DECIMAL_LITERAL; } else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.FLOAT_LITERAL; } else { retreat(); tokenKind = TokenKind.DECIMAL_LITERAL; } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } } else { // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]); // DECIMAL_LITERL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?; // ExponentPart: [eE] [+-]? 
Digits; if (ch == '.') { scanDigit(); if (ch == 'e' || ch == 'E') { scanExp(); } tokenKind = TokenKind.FLOAT_LITERAL; } else { scanDigit(true); if (ch == '.') { scanFraction(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'e' || ch == 'E') { scanExp(); tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') { tokenKind = TokenKind.FLOAT_LITERAL; } else if (ch == 'l' || ch == 'L') { tokenKind = TokenKind.DECIMAL_LITERAL; } else { retreat(); tokenKind = TokenKind.DECIMAL_LITERAL; } } String lexeme = source.substring(start, pos); return new Token(tokenKind, lexeme); } } private Token scanIdentifier() { int start = pos - 1; do { advance(); } while (isIdentifierChar(ch)); String lexeme = source.substring(start, pos); retreat(); if (Token.isKeyword(lexeme)) { TokenKind kind = Token.kind(lexeme); return new Token(kind, kind.literal); } else { return new Token(TokenKind.IDENTIFIER, lexeme); } } private void scanHex() { advance(); if (!isHex(ch)) { error("invalid hexadecimal literal"); } do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isHex(ch)) { error("invalid hexadecimal literal"); } } do { advance(); } while (isHex(ch)); } while (ch == '_'); } private void scanBit() { advance(); if (!isBit(ch)) { error("invalid binary literal"); } do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isBit(ch)) { error("invalid binary literal"); } } do { advance(); } while (isBit(ch)); } while (ch == '_'); } private void scanOct() { do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isDigit(ch)) { error("invalid octal literal"); } if (!isOct(ch)) { return; } } do { advance(); } while (isOct(ch)); } while (ch == '_'); } private void scanDigit(boolean hasDigit) { advance(); if (hasDigit) { if ((ch != '_') && !isDigit(ch)) { return; } } else if (!isDigit(ch)) { error("invalid decimal literal"); } do { if (ch == '_') { do { advance(); } while(ch == '_'); if (!isDigit(ch)) { error("invalid decimal 
literal"); } } do { advance(); } while (isDigit(ch)); } while (ch == '_'); } private void scanDigit() { scanDigit(false); } // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?; private void scanFraction() { if (isDigit(peek())) { scanDigit(); } else { advance(); } if (ch == 'e' || ch == 'E') { char c = peek(); if (c == '+' || c == '-') { advance(); } scanDigit(); } if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { retreat(); } } // HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?; private void scanHexFraction(boolean hasDigit) { if (hasDigit) { if (isHex(peek())) { scanHex(); } else { advance(); } } else { scanHex(); } if ((ch != 'p') && (ch != 'P')) { error("invalid hexadecimal literal"); } char c = peek(); if (c == '+' || c == '-') { advance(); } scanDigit(); if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { retreat(); } } private void scanExp() { char c = peek(); if (c == '+' || c == '-') { advance(); } scanDigit(); if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) { retreat(); } } private void advance() { if (pos >= source.length()) { ch = CharUtils.EOF; return; } ch = source.charAt(pos++); } private void retreat() { if (ch != EOF) { if (--pos < 0) { error("tokenizer exceed beginning of source"); } } } private char peek() { if (pos >= source.length()) { return EOF; } return source.charAt(pos); } private void error(String msg) { log.error("lexer error: {}", msg); throw new RuntimeException(msg); } } 
    11 replies    2023-03-31 14:45:58 +08:00
    angryPHP
        1
    angryPHP  
       Mar 30, 2023
    有没有测试用例啊
    lucaslee
        2
    lucaslee  
       Mar 30, 2023
    第一印象:满屏的 edfl 字符判断都看晕了 XD
    kwh
        3
    kwh  
       Mar 30, 2023
    chatGPT:让我试试
    neptuno
        4
    neptuno  
       Mar 30, 2023 via iPhone
    好长
    NeoZephyr
        5
    NeoZephyr  
    OP
       Mar 30, 2023
    @angryPHP

    有的

    ```java
    // Spock data-driven feature: for each row, builds a Lexer over `source`, takes the first
    // token from nextToken(), and checks its kind name and lexeme against the row.
    // NOTE(review): in every row `source` equals the expected `lexeme`, so each literal runs to
    // end of input; no row has a literal followed by whitespace or another token.
    def "scan number without exception"() {
    when:
    Lexer lexer = new Lexer(source)
    Token token = lexer.nextToken()

    then:
    kind == token.kind.name()
    lexeme == token.lexeme

    where:
    source | kind | lexeme

    // BINARY_LITERAL: '0' [bB] [01] ([01_]* [01])? [lL]?;
    "0b0" | "BINARY_LITERAL" | "0b0"
    "0b1" | "BINARY_LITERAL" | "0b1"
    "0b000" | "BINARY_LITERAL" | "0b000"
    "0b010" | "BINARY_LITERAL" | "0b010"
    "0B0L" | "BINARY_LITERAL" | "0B0L"
    "0b10___10l" | "BINARY_LITERAL" | "0b10___10l"
    "0b000___0L" | "BINARY_LITERAL" | "0b000___0L"
    "0b001___0L" | "BINARY_LITERAL" | "0b001___0L"

    // OCT_LITERAL: '0' '_'* [0-7] ([0-7_]* [0-7])? [lL]?;

    "0000" | "OCT_LITERAL" | "0000"
    "0__007_2" | "OCT_LITERAL" | "0__007_2"
    "0__7_0L" | "OCT_LITERAL" | "0__7_0L"
    "07_0L" | "OCT_LITERAL" | "07_0L"
    "0000l" | "OCT_LITERAL" | "0000l"
    "0007l" | "OCT_LITERAL" | "0007l"
    "0_00_77__0L" | "OCT_LITERAL" | "0_00_77__0L"

    // HEX_LITERAL: '0' [xX] [0-9a-fA-F] ([0-9a-fA-F_]* [0-9a-fA-F])? [lL]?;

    "0x0" | "HEX_LITERAL" | "0x0"
    "0x000l" | "HEX_LITERAL" | "0x000l"
    "0x0eeeL" | "HEX_LITERAL" | "0x0eeeL"
    "0x000eee" | "HEX_LITERAL" | "0x000eee"
    "0x1_E2E3e5" | "HEX_LITERAL" | "0x1_E2E3e5"
    "0xe___ee_eL" | "HEX_LITERAL" | "0xe___ee_eL"

    // DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?;

    "0L" | "DECIMAL_LITERAL" | "0L"
    "0" | "DECIMAL_LITERAL" | "0"
    "9l" | "DECIMAL_LITERAL" | "9l"
    "99___9L" | "DECIMAL_LITERAL" | "99___9L"
    "9___99___9L" | "DECIMAL_LITERAL" | "9___99___9L"
    "9999" | "DECIMAL_LITERAL" | "9999"
    "1_000_000" | "DECIMAL_LITERAL" | "1_000_000"

    // HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?;

    "0x.fp012" | "HEX_FLOAT_LITERAL" | "0x.fp012"
    "0x.f__f_fp-0012d" | "HEX_FLOAT_LITERAL" | "0x.f__f_fp-0012d"
    "0xf_ffP12d" | "HEX_FLOAT_LITERAL" | "0xf_ffP12d"
    "0xf_f.P-12d" | "HEX_FLOAT_LITERAL" | "0xf_f.P-12d"
    "0xf_f.f_fP+12" | "HEX_FLOAT_LITERAL" | "0xf_f.f_fP+12"
    "0x0.0000000fp-11" | "HEX_FLOAT_LITERAL" | "0x0.0000000fp-11"
    "0xf_ff.P12d" | "HEX_FLOAT_LITERAL" | "0xf_ff.P12d"
    "0X0P0f" | "HEX_FLOAT_LITERAL" | "0X0P0f"
    "0X0P0" | "HEX_FLOAT_LITERAL" | "0X0P0"
    "0X0_0__123P0f" | "HEX_FLOAT_LITERAL" | "0X0_0__123P0f"
    "0XeP0f" | "HEX_FLOAT_LITERAL" | "0XeP0f"
    "0X000.P0f" | "HEX_FLOAT_LITERAL" | "0X000.P0f"
    "0X00e.P0f" | "HEX_FLOAT_LITERAL" | "0X00e.P0f"
    "0X0e__0.0P0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0P0f"
    "0X0e__0.0__0P0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__0P0f"
    "0X0e__0.0__e0P-0__0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__e0P-0__0f"
    "0X0e__0.0__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0e__0.0__e0P+0_1__0f"
    "0X0.0__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0.0__e0P+0_1__0f"
    "0X0.00P0f" | "HEX_FLOAT_LITERAL" | "0X0.00P0f"
    "0X0.0eP0f" | "HEX_FLOAT_LITERAL" | "0X0.0eP0f"
    "0X0.e__00P0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00P0f"
    "0X0.e__00__0P0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00__0P0f"
    "0X0.e__00__e0P-0__0f" | "HEX_FLOAT_LITERAL" | "0X0.e__00__e0P-0__0f"
    "0X0e.0__00__e0P+0_1__0f" | "HEX_FLOAT_LITERAL" | "0X0e.0__00__e0P+0_1__0f"
    "0X.0__00__e0P-0_1__0F" | "HEX_FLOAT_LITERAL" | "0X.0__00__e0P-0_1__0F"
    "0X.0__00__e0P-0_1__0" | "HEX_FLOAT_LITERAL" | "0X.0__00__e0P-0_1__0"

    // FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
    // FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
    // ExponentPart: [eE] [+-]? Digits

    "0f" | "FLOAT_LITERAL" | "0f"
    "00f" | "FLOAT_LITERAL" | "00f"
    "0__0_0f" | "FLOAT_LITERAL" | "0__0_0f"
    "0001f" | "FLOAT_LITERAL" | "0001f"
    "0e0f" | "FLOAT_LITERAL" | "0e0f"
    "0e0" | "FLOAT_LITERAL" | "0e0"
    "1e1" | "FLOAT_LITERAL" | "1e1"
    "0_0e-0_0f" | "FLOAT_LITERAL" | "0_0e-0_0f"
    "0_0e0_120f" | "FLOAT_LITERAL" | "0_0e0_120f"
    "0001e0009" | "FLOAT_LITERAL" | "0001e0009"
    "000012345e1" | "FLOAT_LITERAL" | "000012345e1"
    "0_00___9900d" | "FLOAT_LITERAL" | "0_00___9900d"
    ".0" | "FLOAT_LITERAL" | ".0"
    ".0e0" | "FLOAT_LITERAL" | ".0e0"
    ".0_000" | "FLOAT_LITERAL" | ".0_000"
    ".0___0990" | "FLOAT_LITERAL" | ".0___0990"
    ".000e0__0__0" | "FLOAT_LITERAL" | ".000e0__0__0"
    ".000e0__0__0f" | "FLOAT_LITERAL" | ".000e0__0__0f"
    ".000e-0__9__0f" | "FLOAT_LITERAL" | ".000e-0__9__0f"
    ".9e-0__1_0f" | "FLOAT_LITERAL" | ".9e-0__1_0f"
    "0__00." | "FLOAT_LITERAL" | "0__00."
    "0__090." | "FLOAT_LITERAL" | "0__090."
    "99__9." | "FLOAT_LITERAL" | "99__9."
    "000.000" | "FLOAT_LITERAL" | "000.000"
    "0__10.090" | "FLOAT_LITERAL" | "0__10.090"
    "000__1_0.090" | "FLOAT_LITERAL" | "000__1_0.090"
    "000__1_0.090e12" | "FLOAT_LITERAL" | "000__1_0.090e12"
    "0__10.090e1__00" | "FLOAT_LITERAL" | "0__10.090e1__00"
    "3.1415926" | "FLOAT_LITERAL" | "3.1415926"
    "0.030" | "FLOAT_LITERAL" | "0.030"
    "0.6" | "FLOAT_LITERAL" | "0.6"
    ".0f" | "FLOAT_LITERAL" | ".0f"
    ".0_000f" | "FLOAT_LITERAL" | ".0_000f"
    ".0___0990f" | "FLOAT_LITERAL" | ".0___0990f"
    // NOTE(review): the next row duplicates a row that appears earlier in this table.
    ".000e0__0__0f" | "FLOAT_LITERAL" | ".000e0__0__0f"
    "0__00.f" | "FLOAT_LITERAL" | "0__00.f"
    "0__090.f" | "FLOAT_LITERAL" | "0__090.f"
    "99__9.f" | "FLOAT_LITERAL" | "99__9.f"
    "000.000f" | "FLOAT_LITERAL" | "000.000f"
    "0__10.090f" | "FLOAT_LITERAL" | "0__10.090f"
    "000__1_0.090f" | "FLOAT_LITERAL" | "000__1_0.090f"
    "000__1_0.090e12f" | "FLOAT_LITERAL" | "000__1_0.090e12f"
    "0__10.090e1__00f" | "FLOAT_LITERAL" | "0__10.090e1__00f"
    "3.1415926f" | "FLOAT_LITERAL" | "3.1415926f"
    "0.030f" | "FLOAT_LITERAL" | "0.030f"
    "0.6f" | "FLOAT_LITERAL" | "0.6f"
    }
    ```
    NeoZephyr
        6
    NeoZephyr  
    OP
       Mar 30, 2023
    @kwh 我问了,他给的建议,感觉不大实用
    NeoZephyr
        7
    NeoZephyr  
    OP
       Mar 30, 2023
    @lucaslee

    手写的,我也没有想到比较好的办法
    lucaslee
        8
    lucaslee  
       Mar 30, 2023
    才看到注释,写那么多代码是在翻译正则?
    Leviathann
        9
    Leviathann  
       Mar 30, 2023
    见不得这种副作用满天飞的代码
    popvlovs
        10
    popvlovs  
       Mar 31, 2023
    看的我眼花 o_o ....
    我只用过 antlr ,从来没考虑过自己手搓 lexer ,这是要拿来练手么?
    NeoZephyr
        11
    NeoZephyr  
    OP
       Mar 31, 2023
    @popvlovs

    我也想知道不用 antlr 这种工具,有啥比较好的办法实现 lexer
    About     Help     Advertise     Blog     API     FAQ     Solana     5415 Online   Highest 6679       Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 68ms UTC 07:44 PVG 15:44 LAX 00:44 JFK 03:44
    Do have faith in what you're doing.
    ubao msn snddm index pchome yahoo rakuten mypaper meadowduck bidyahoo youbao zxmzxm asda bnvcg cvbfg dfscv mmhjk xxddc yybgb zznbn ccubao uaitu acv GXCV ET GDG YH FG BCVB FJFH CBRE CBC GDG ET54 WRWR RWER WREW WRWER RWER SDG EW SF DSFSF fbbs ubao fhd dfg ewr dg df ewwr ewwr et ruyut utut dfg fgd gdfgt etg dfgt dfgd ert4 gd fgg wr 235 wer3 we vsdf sdf gdf ert xcv sdf rwer hfd dfg cvb rwf afb dfh jgh bmn lgh rty gfds cxv xcv xcs vdas fdf fgd cv sdf tert sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf sdf shasha9178 shasha9178 shasha9178 shasha9178 shasha9178 liflif2 liflif2 liflif2 liflif2 liflif2 liblib3 liblib3 liblib3 liblib3 liblib3 zhazha444 zhazha444 zhazha444 zhazha444 zhazha444 dende5 dende denden denden2 denden21 fenfen9 fenf619 fen619 fenfe9 fe619 sdf sdf sdf sdf sdf zhazh90 zhazh0 zhaa50 zha90 zh590 zho zhoz zhozh zhozho zhozho2 lislis lls95 lili95 lils5 liss9 sdf0ty987 sdft876 sdft9876 sdf09876 sd0t9876 sdf0ty98 sdf0976 sdf0ty986 sdf0ty96 sdf0t76 sdf0876 df0ty98 sf0t876 sd0ty76 sdy76 sdf76 sdf0t76 sdf0ty9 sdf0ty98 sdf0ty987 sdf0ty98 sdf6676 sdf876 sd876 sd876 sdf6 sdf6 sdf9876 sdf0t sdf06 sdf0ty9776 sdf0ty9776 sdf0ty76 sdf8876 sdf0t sd6 sdf06 s688876 sd688 sdf86