From 4b6635796c1573f0530f7e6412a3df278d37ad65 Mon Sep 17 00:00:00 2001 From: jackfiled Date: Thu, 18 Apr 2024 16:34:32 +0800 Subject: [PATCH] =?UTF-8?q?refeat:=20ILexer=E6=8E=A5=E5=8F=A3=E9=80=82?= =?UTF-8?q?=E9=85=8D=20(#38)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Huaps <1183155719@qq.com> Co-authored-by: duqoo <92306417+duqoo@users.noreply.github.com> Reviewed-on: https://git.rrricardo.top/PostGuard/Canon/pulls/38 --- Canon.Core/Abstractions/ISourceReader.cs | 26 +- Canon.Core/Enums/SemanticEnums.cs | 10 +- Canon.Core/LexicalParser/LexRules.cs | 83 ++ Canon.Core/LexicalParser/LexemeFactory.cs | 95 ++ Canon.Core/LexicalParser/Lexer.cs | 961 ++++++++---------- Canon.Core/LexicalParser/SemanticToken.cs | 14 - Canon.Tests/CCodeGeneratorTests/BasicTests.cs | 6 +- .../GrammarParserTests/PascalGrammarTests.cs | 21 +- .../LexicalParserTests/CharacterTypeTests.cs | 15 +- .../LexicalParserTests/DelimiterTests.cs | 9 +- .../LexicalParserTests/ErrorSingleTests.cs | 7 +- .../IndentifierTypeTests.cs | 10 +- .../LexicalParserTests/KeywordTypeTests.cs | 8 +- .../LexicalParserTests/LexicalFileTests.cs | 187 +++- Canon.Tests/LexicalParserTests/NumberTests.cs | 13 +- .../LexicalParserTests/OperatorTypeTests.cs | 9 +- Canon.Tests/Utils/EnumerableExtensions.cs | 15 + Canon.Tests/Utils/StringSourceReader.cs | 74 +- Canon.Tests/Utils/StringSourceReaderTests.cs | 110 +- 19 files changed, 952 insertions(+), 721 deletions(-) create mode 100644 Canon.Core/LexicalParser/LexRules.cs create mode 100644 Canon.Core/LexicalParser/LexemeFactory.cs create mode 100644 Canon.Tests/Utils/EnumerableExtensions.cs diff --git a/Canon.Core/Abstractions/ISourceReader.cs b/Canon.Core/Abstractions/ISourceReader.cs index 11336cb..7d3a04d 100644 --- a/Canon.Core/Abstractions/ISourceReader.cs +++ b/Canon.Core/Abstractions/ISourceReader.cs @@ -7,12 +7,7 @@ namespace Canon.Core.Abstractions; /// public interface ISourceReader { - /// - /// 尝试读取下一个字符 - /// - /// 读取到的字符 - /// 是否成功读取 - public bool TryReadChar([NotNullWhen(true)] out char? c); + public char Current { get; } /// /// 源文件名称 @@ -28,4 +23,23 @@ public interface ISourceReader /// 当前读取字符的列号 /// public uint Pos { get; } + + /// + /// 回退一个字符 + /// + /// 回退是否成功 + public bool Retract(); + + /// + /// 前进一个字符 + /// + /// + public bool MoveNext(); + + /// + /// 读取下一个字符但是移进 + /// + /// 读取到的下一个字符 + /// 是否能够读取下一个字符 + public bool TryPeekChar([NotNullWhen(true)] out char? c); } diff --git a/Canon.Core/Enums/SemanticEnums.cs b/Canon.Core/Enums/SemanticEnums.cs index 13be0ec..05d0087 100644 --- a/Canon.Core/Enums/SemanticEnums.cs +++ b/Canon.Core/Enums/SemanticEnums.cs @@ -9,7 +9,6 @@ public enum SemanticTokenType Identifier, Character, Empty, - Error, // 加了一个错误token /// /// 语法分析中的栈底符号 /// @@ -90,10 +89,15 @@ public enum NumberType public enum StateType { + Start, + Comment, Word, - Digit, + Num, Delimiter, - Operator + Operator, + BreakPoint, + Unknown, + Done } public enum BasicIdType diff --git a/Canon.Core/LexicalParser/LexRules.cs b/Canon.Core/LexicalParser/LexRules.cs new file mode 100644 index 0000000..2746679 --- /dev/null +++ b/Canon.Core/LexicalParser/LexRules.cs @@ -0,0 +1,83 @@ +namespace Canon.Core.LexicalParser; + +public static class LexRules +{ + // 保留关键字 + private static readonly string[] _keywords = + [ + "Program", "Const", "Var", "Procedure", + "Function", "Begin", "End", "Array", + "Of", "If", "Then", "Else", + "For", "To", "Do", "Integer", + "Real", "Boolean", "Character", "Divide", + "Not", "Mod", "And", "Or" + ]; + + private static readonly string[] _delimiter = [";", ",", ":", ".", "(", ")", "[", "]", "'", "\"", ".."]; + + private static readonly string[] _operator = ["=", "<>", "<", "<=", ">", ">=", "+", "-", "*", "/", ":="]; + + // 判断字符 + public static bool IsDigit(char _ch) { + if (_ch >= '0' && _ch <= '9') return true; + return false; + } + + public static bool IsHexDigit(char _ch) + { + if ((_ch >= '0' && _ch <= '9') || (_ch<= 'F' && _ch >= 'A')) return true; + return false; + } + + public static bool IsLetter(char _ch) { + if ((_ch >= 'A' && _ch <= 'Z') || (_ch >= 'a' && _ch <= 'z' || _ch == '_')) { + return true; + } + return false; + } + + public static bool IsKeyword(string tokenString) + { + + foreach (var t in _keywords) + { + if (string.Equals(tokenString, t, StringComparison.OrdinalIgnoreCase)) return true; + } + return false; + } + + + public static bool IsDelimiter(char ch) + { + foreach (var delimiter in _delimiter) + { + if (delimiter.Contains(ch)) + { + return true; + } + } + return false; + } + + public static bool IsOperator(char ch) + { + foreach (var o in _operator) + { + if (o.Contains(ch)) + { + return true; + } + } + return false; + } + + public static bool IsBreakPoint(char ch) + { + if (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r') + { + return true; + } + + return false; + } +} diff --git a/Canon.Core/LexicalParser/LexemeFactory.cs b/Canon.Core/LexicalParser/LexemeFactory.cs new file mode 100644 index 0000000..c612dcd --- /dev/null +++ b/Canon.Core/LexicalParser/LexemeFactory.cs @@ -0,0 +1,95 @@ +using Canon.Core.Enums; + +namespace Canon.Core.LexicalParser; + +public static class LexemeFactory +{ + + public static SemanticToken MakeToken(SemanticTokenType tokenType,string literal,uint _line,uint _chPos) + { + SemanticToken? token; + switch (tokenType) + { + case SemanticTokenType.Character: + CharacterSemanticToken characterSemanticToken = new CharacterSemanticToken() + { + LinePos = _line, CharacterPos = _chPos, LiteralValue = literal, + }; + token = characterSemanticToken; + break; + case SemanticTokenType.Identifier: + IdentifierSemanticToken identifierSemanticToken = new IdentifierSemanticToken() + { + LinePos = _line, CharacterPos = _chPos, LiteralValue = literal, + }; + token = identifierSemanticToken; + break; + default: + throw new ArgumentOutOfRangeException(nameof(tokenType), tokenType, null); + } + + return token; + + + } + + public static KeywordSemanticToken MakeToken(KeywordType keywordType,string literal,uint _line,uint _chPos) + { + KeywordSemanticToken keywordSemanticToken = new KeywordSemanticToken + { + LinePos = _line, + CharacterPos = _chPos, + LiteralValue = literal, + KeywordType = keywordType + }; + return keywordSemanticToken; + } + + public static DelimiterSemanticToken MakeToken(DelimiterType delimiterType,string literal,uint _line,uint _chPos) + { + DelimiterSemanticToken delimiterSemanticToken = new DelimiterSemanticToken() + { + LinePos = _line, + CharacterPos = _chPos, + LiteralValue = literal, + DelimiterType = delimiterType + }; + return delimiterSemanticToken; + } + + public static NumberSemanticToken MakeToken(NumberType numberType,string literal,uint _line,uint _chPos) + { + string temp = literal; + string result; + if (numberType == NumberType.Hex) + { + result = string.Concat("0x", temp.AsSpan(1, temp.Length - 1)); + } + else + { + result = temp; + } + + NumberSemanticToken numberSemanticToken = new NumberSemanticToken() + { + LinePos = _line, + CharacterPos = _chPos, + LiteralValue = result, + NumberType = numberType + }; + return numberSemanticToken; + + } + + public static OperatorSemanticToken MakeToken(OperatorType operatorType,string literal,uint _line,uint _chPos) + { + OperatorSemanticToken operatorSemanticToken = new OperatorSemanticToken() + { + LinePos = _line, + CharacterPos = _chPos, + LiteralValue = literal, + OperatorType = operatorType + }; + return operatorSemanticToken; + } +} diff --git a/Canon.Core/LexicalParser/Lexer.cs b/Canon.Core/LexicalParser/Lexer.cs index abab562..edcd595 100644 --- a/Canon.Core/LexicalParser/Lexer.cs +++ b/Canon.Core/LexicalParser/Lexer.cs @@ -1,50 +1,29 @@ -using System.Numerics; using System.Text; +using Canon.Core.Abstractions; using Canon.Core.Enums; using Canon.Core.Exceptions; namespace Canon.Core.LexicalParser; -public class Lexer(string source) +public class Lexer : ILexer { - - // 保留关键字 - private readonly string[] _keywords = - [ - "Program", "Const", "Var", "Procedure", - "Function", "Begin", "End", "Array", - "Of", "If", "Then", "Else", - "For", "To", "Do", "Integer", - "Real", "Boolean", "Character", "Divide", - "Not", "Mod", "And", "Or" - ]; - - private readonly string[] _delimiter = [";", ",", ":", ".", "(", ")", "[", "]", "'", "\"", ".."]; - - private readonly string[] _operator = ["=", "<>", "<", "<=", ">", ">=", "+", "-", "*", "/", ":="]; + // 记录token + private SemanticToken? _semanticToken; + private readonly StringBuilder _tokenBuilder = new(); + private readonly List _tokens = []; // 状态机 - private StateType _state; + private StateType _state = StateType.Start; private char _ch; + private bool _finish; - private LinkedList _token = new LinkedList(); - - // bool save; - // int saved_state; - bool _finish; - - - //缓冲区 - private readonly char[] _buffer = new char[2048]; - - // int start_pos; - private int _fwdPos; - - // 计数器 + // 文件读取 + private ISourceReader _reader; private uint _line = 1; private uint _chPos; - private readonly Dictionary _tokenCount = new Dictionary + // Token统计信息 + private readonly Dictionary _tokenCount = new() { { SemanticTokenType.Keyword, 0 }, { SemanticTokenType.Number, 0 }, @@ -52,264 +31,244 @@ public class Lexer(string source) { SemanticTokenType.Delimiter, 0 }, { SemanticTokenType.Identifier, 0 }, { SemanticTokenType.Character, 0 }, - { SemanticTokenType.Error, 0 }, { SemanticTokenType.End, 0 } }; - private readonly List _tokens = []; - - public List Tokenize() + public IEnumerable Tokenize(ISourceReader reader) { - // 缓冲区 - // start_pos = 0; - _fwdPos = 0; + _reader = reader; - // 状态机 - _finish = false; - - while (!_finish) + while (_state != StateType.Done) { - GetChar(); - GetNbc(); - if (_finish) break; + switch (_state) + { + case StateType.Start: + HandleStartState(); + break; + case StateType.Comment: + if (_ch == '{') + { + HandleCommentStateBig(); + } + else if (_ch == '*') + { + HandleCommentStateSmall(); + } + else + { + HandleCommentSingleLine(); + } - _token = new LinkedList(); + break; + case StateType.Num: + HandleNumState(); + break; + case StateType.Word: + HandleWordState(); + break; + case StateType.Delimiter: + HandleDelimiterState(); + break; + case StateType.Operator: + HandleOperatorState(); + break; + case StateType.BreakPoint: + while (LexRules.IsBreakPoint(_ch)) + { + GetChar(); + } - if (IsLetter()) - { - _state = StateType.Word; + Retract(); + _state = StateType.Start; + break; + case StateType.Unknown: + throw new LexemeException(LexemeErrorType.UnknownCharacterOrString, _line, _chPos, + "Illegal lexeme."); + case StateType.Done: + break; } - else if(_ch == '.') + } + + _tokens.Add(SemanticToken.End); + return _tokens; + } + + private void HandleStartState() + { + // 初始化 + ResetTokenBuilder(); + + // 读取首个字符 + GetChar(); + + if (_finish) + { + _state = StateType.Done; + return; + } + + // 根据首个字符判断可能的情况 + if (_ch == '{') // 以 “{” 开头,为注释 + { + _state = StateType.Comment; + } + else if (_ch == '(') + { + char nextChar = PeekNextChar(); + if (nextChar == '*') { - char next = PeekNextChar(); - if (next >= '0' && next <= '9') - { - _state = StateType.Digit; - } - else - { - _state = StateType.Delimiter; - } + GetChar(); + _state = StateType.Comment; } - else if (IsDigit() || _ch == '$') - { - _state = StateType.Digit; - } - else if (IsDelimiter()) + else { _state = StateType.Delimiter; } - else if (_ch == '{') + } + else if (_ch == '/') + { + char nextChar = PeekNextChar(); + if (nextChar == '/') { - while (_ch != '}') - { - GetChar(); - if (_ch == '\n') - { - _line++; - _chPos = 0; - } - if (_finish) - { - throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, "The comment is not closed."); - } - - } - - continue; + GetChar(); + _state = StateType.Comment; } else { _state = StateType.Operator; } - - switch (_state) - { - case StateType.Word: - while (IsDigit() || IsLetter()) - { - Cat(); - GetChar(); - } - - Retract(); - - if (IsKeyword()) - { - KeywordType keywordType = - KeywordSemanticToken.GetKeywordTypeByKeyword(LinkedListToString(_token.First)); - MakeToken(keywordType); - } - else - { - MakeToken(SemanticTokenType.Identifier); - } - - break; - case StateType.Digit: - DealNumber(); - break; - case StateType.Delimiter: - Cat(); - switch (_ch) - { - case '.': - { - GetChar(); - if (_ch == '.') - { - Cat(); - MakeToken(DelimiterType.DoubleDots); - break; - } - - Retract(); - if (IsDot()) - { - MakeToken(DelimiterType.Dot); - } - else - { - MakeToken(DelimiterType.Period); - } - } - break; - case '\'': - case '\"': - { - // 重置_token,准备收集字符串内容 - _token = new LinkedList(); - - GetChar(); // 移动到下一个字符,即字符串的第一个字符 - while (_ch != '\'' && _ch != '\"') - { - Cat(); // 收集字符 - GetChar(); // 移动到下一个字符 - if (_ch == '\n' || _finish) - { - throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos, "The String is not closed."); - } - } - - MakeToken(SemanticTokenType.Character); // 或其它适用于字符串字面量的SemanticTokenType - _token = new LinkedList(); // 重置_token - - if (!(_ch == '\'' || _ch == '\"')) - { - throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos, "The String is not closed."); - } - } - break; - case ',': - MakeToken(DelimiterType.Comma); - break; - case ':': - char nextChar = PeekNextChar(); - if (nextChar == '=') - { - GetChar(); - Cat(); - MakeToken(OperatorType.Assign); - } - else - { - MakeToken(DelimiterType.Colon); - } - - break; - case ';': - MakeToken(DelimiterType.Semicolon); - break; - case '(': - char next = PeekNextChar(); - if (next == '*') - { - GetChar(); - bool commentClosed = false; - while (!commentClosed) - { - GetNbc(); - GetChar(); - while (_ch != '*') - { - GetNbc(); - GetChar(); - if (_finish) - { - throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, "The comment is not closed."); - } - } - - GetChar(); - if (_finish) - { - throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, "The comment is not closed."); - } - - if (_ch == ')') commentClosed = true; - } - } - else - { - MakeToken(DelimiterType.LeftParenthesis); - } - - break; - case ')': - MakeToken(DelimiterType.RightParenthesis); - break; - case '[': - MakeToken(DelimiterType.LeftSquareBracket); - break; - case ']': - MakeToken(DelimiterType.RightSquareBracket); - break; - } - - break; - case StateType.Operator: - DealOther(); - break; - default: - throw new ArgumentOutOfRangeException(); - } - } - - return _tokens; + else if (_ch == '.') // 以 “.” 开头,可能是数字或分隔符 + { + char next = PeekNextChar(); + if (next is >= '0' and <= '9') + { + _state = StateType.Num; + } + else + { + _state = StateType.Delimiter; + } + } + else if (LexRules.IsLetter(_ch)) // 以字母开头,为关键字或标识符 + { + _state = StateType.Word; + } + else if (LexRules.IsDigit(_ch) || _ch == '$') // 以数字或 “$” 开头,为数值 + { + _state = StateType.Num; + } + else if (LexRules.IsDelimiter(_ch)) // 为分隔符 + { + _state = StateType.Delimiter; + } + else if (LexRules.IsOperator(_ch)) // 为运算符 + { + _state = StateType.Operator; + } + else if (LexRules.IsBreakPoint(_ch)) + { + _state = StateType.BreakPoint; + } + else + { + _state = StateType.Unknown; + } } - private void DealNumber() + private void HandleCommentStateBig() { + while (_ch != '}') + { + GetChar(); + + if (_finish) + { + throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, + "The comment is not closed."); + } + } + + _state = StateType.Start; + } + + private void HandleCommentStateSmall() + { + bool commentClosed = false; + while (!commentClosed) + { + GetChar(); + while (_ch != '*') + { + GetChar(); + if (_finish) + { + throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, + "The comment is not closed."); + } + } + + GetChar(); + if (_finish) + { + throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, + "The comment is not closed."); + } + + if (_ch == ')') commentClosed = true; + } + + _state = StateType.Start; + } + + private void HandleCommentSingleLine() + { + while (_ch != '\n') + { + GetChar(); + } + + _state = StateType.Start; + } + + private void HandleWordState() + { + while (LexRules.IsDigit(_ch) || LexRules.IsLetter(_ch)) + { + Cat(); + GetChar(); + } + + Retract(); + + string tokenString = GetCurrentTokenString(); + if (LexRules.IsKeyword(tokenString)) + { + KeywordType keywordType = + KeywordSemanticToken.GetKeywordTypeByKeyword(GetCurrentTokenString()); + + _semanticToken = LexemeFactory.MakeToken(keywordType, tokenString, _line, _chPos); + } + else + { + _semanticToken = LexemeFactory.MakeToken(SemanticTokenType.Identifier, tokenString, _line, _chPos); + } + + AddToTokens(_semanticToken); + _state = StateType.Start; + } + + private void HandleNumState() + { + NumberType numberType = NumberType.Integer; // 十六进制 if (_ch == '$') { - Cat(); - - GetChar(); - while (!NumberShouldBreak()) - { - // 假设IsHexDigit方法能够识别十六进制数字 - if (IsHexDigit()) - { - Cat(); - GetChar(); - } - else if(NumberShouldBreak()) - { - break; - } - else - { - throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal hex numbers!"); - } - } - MakeToken(NumberType.Hex); - return; + ProcessHex(); + numberType = NumberType.Hex; } - // 非十六进制 - if(IsDigit() || _ch == '.') + else if (LexRules.IsDigit(_ch) || _ch == '.') { while (!NumberShouldBreak()) { @@ -321,61 +280,66 @@ public class Lexer(string source) if (next == '.') { Retract(); + _state = StateType.Delimiter; break; } // 不是符号 “..”,进入小数点后的判断 - Cat(); // 记录“.” + Cat(); // 记录“.” // “.”后不应为空,至少应该有一位小数 GetChar(); if (NumberShouldBreak()) { - throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal numbers!"); + throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, + "Illegal numbers!"); } // 读取小数点后的数字 while (!NumberShouldBreak()) { - if (IsDigit()) + if (LexRules.IsDigit(_ch)) { Cat(); GetChar(); } else if (_ch == 'e' || _ch == 'E') { - DealE(); + ProcessE(); break; } - else if(NumberShouldBreak()) + else if (NumberShouldBreak()) { break; } else { - throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal number."); + throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, + "Illegal number."); } } - MakeToken(NumberType.Real); - return; + + numberType = NumberType.Real; + break; } // 不含小数部分,含科学计数法 if (_ch == 'e' || _ch == 'E') { - DealE(); - MakeToken(NumberType.Real); - return; + ProcessE(); + numberType = NumberType.Real; + break; } // 暂时为整数 - if (IsDigit()) + if (LexRules.IsDigit(_ch)) { Cat(); GetChar(); } - else if(NumberShouldBreak()) + else if (NumberShouldBreak()) { + numberType = NumberType.Integer; break; } else @@ -383,16 +347,44 @@ public class Lexer(string source) throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal number."); } } - MakeToken(NumberType.Integer); } + _semanticToken = LexemeFactory.MakeToken(numberType, GetCurrentTokenString(), + _line, _chPos); + AddToTokens(_semanticToken); + _state = StateType.Start; } - private void DealE() + private void ProcessHex() { Cat(); GetChar(); - if (IsDigit() || _ch == '+' || _ch == '-') + + while (!NumberShouldBreak()) + { + // 假设IsHexDigit方法能够识别十六进制数字 + if (LexRules.IsHexDigit(_ch)) + { + Cat(); + GetChar(); + } + else if (NumberShouldBreak()) + { + break; + } + else + { + throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, + "Illegal hex numbers!"); + } + } + } + + private void ProcessE() + { + Cat(); + GetChar(); + if (LexRules.IsDigit(_ch) || _ch == '+' || _ch == '-') { Cat(); } @@ -405,7 +397,7 @@ public class Lexer(string source) GetChar(); while (!NumberShouldBreak()) { - if (IsDigit()) + if (LexRules.IsDigit(_ch)) { Cat(); GetChar(); @@ -419,7 +411,8 @@ public class Lexer(string source) bool NumberShouldBreak() { - if (_ch == ' ' || _ch == '\n' || _ch == '\t' || _ch == '\r' || (IsDelimiter() && _ch!='.') || IsOperator() || _finish) + if (_ch == ' ' || _ch == '\n' || _ch == '\t' || _ch == '\r' || (LexRules.IsDelimiter(_ch) && _ch != '.') || + LexRules.IsOperator(_ch) || _finish) { Retract(); return true; @@ -428,18 +421,6 @@ public class Lexer(string source) return false; } - private bool IsOperator() - { - foreach (var o in _operator) - { - if (o.Contains(_ch)) - { - return true; - } - } - return false; - } - private bool IsDot() { if (_tokens.Count != 0) @@ -447,33 +428,136 @@ public class Lexer(string source) SemanticToken tokenBefore = _tokens.Last(); if (tokenBefore.TokenType == SemanticTokenType.Identifier) return true; } + return false; } + private void HandleDelimiterState() + { + Cat(); + switch (_ch) + { + case '.': + { + GetChar(); + if (_ch == '.') + { + Cat(); + _semanticToken = LexemeFactory.MakeToken(DelimiterType.DoubleDots, "..", _line, _chPos); + break; + } - private void DealOther() + Retract(); + if (IsDot()) + { + _semanticToken = LexemeFactory.MakeToken(DelimiterType.Dot, ".", _line, _chPos); + } + else + { + _semanticToken = LexemeFactory.MakeToken(DelimiterType.Period, ".", _line, _chPos); + } + } + break; + case '\'': + case '\"': + { + // 重置_token,准备收集字符串内容 + ResetTokenBuilder(); + + GetChar(); // 移动到下一个字符,即字符串的第一个字符 + while (_ch != '\'' && _ch != '\"') + { + Cat(); // 收集字符 + GetChar(); // 移动到下一个字符 + if (_ch == '\n' || _finish) + { + throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos, + "The String is not closed."); + } + } + + _semanticToken = LexemeFactory.MakeToken(SemanticTokenType.Character, + GetCurrentTokenString(), _line, _chPos); + + ResetTokenBuilder(); + + if (!(_ch == '\'' || _ch == '\"')) + { + throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos, + "The String is not closed."); + } + } + break; + case ',': + _semanticToken = LexemeFactory.MakeToken(DelimiterType.Comma, ",", _line, _chPos); + + break; + case ':': + char nextChar = PeekNextChar(); + if (nextChar == '=') + { + GetChar(); + Cat(); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Assign, ":=", _line, _chPos); + } + else + { + _semanticToken = LexemeFactory.MakeToken(DelimiterType.Colon, ":", _line, _chPos); + } + + break; + case ';': + _semanticToken = LexemeFactory.MakeToken(DelimiterType.Semicolon, ";", _line, _chPos); + + break; + case '(': + _semanticToken = LexemeFactory.MakeToken(DelimiterType.LeftParenthesis, "(", _line, _chPos); + break; + case ')': + _semanticToken = LexemeFactory.MakeToken(DelimiterType.RightParenthesis, ")", _line, _chPos); + + break; + case '[': + _semanticToken = LexemeFactory.MakeToken(DelimiterType.LeftSquareBracket, "[", _line, _chPos); + + break; + case ']': + _semanticToken = LexemeFactory.MakeToken(DelimiterType.RightSquareBracket, "]", _line, _chPos); + break; + } + + AddToTokens(_semanticToken); + _state = StateType.Start; + } + + private void HandleOperatorState() { switch (_ch) { case '+': // 识别 + Cat(); - MakeToken(OperatorType.Plus); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Plus, "+", _line, _chPos); + AddToTokens(_semanticToken); break; case '-': // 识别 - Cat(); - MakeToken(OperatorType.Minus); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Minus, "-", _line, _chPos); + AddToTokens(_semanticToken); break; case '*': // 识别 * Cat(); - MakeToken(OperatorType.Multiply); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Multiply, "*", _line, _chPos); + AddToTokens(_semanticToken); break; case '/': // 识别 / Cat(); - MakeToken(OperatorType.Divide); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Divide, "/", _line, _chPos); + AddToTokens(_semanticToken); break; case '=': Cat(); - MakeToken(OperatorType.Equal); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Equal, "=", _line, _chPos); + AddToTokens(_semanticToken); break; case '<': Cat(); @@ -482,20 +566,24 @@ public class Lexer(string source) { // 识别 <= Cat(); - MakeToken(OperatorType.LessEqual); + _semanticToken = LexemeFactory.MakeToken(OperatorType.LessEqual, "<=", _line, _chPos); + AddToTokens(_semanticToken); } - else if(_ch == '>') + else if (_ch == '>') { // 识别 <> Cat(); - MakeToken(OperatorType.NotEqual); + _semanticToken = LexemeFactory.MakeToken(OperatorType.NotEqual, ">", _line, _chPos); + AddToTokens(_semanticToken); } else { // 识别 < Retract(); - MakeToken(OperatorType.Less); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Less, "<", _line, _chPos); + AddToTokens(_semanticToken); } + break; case '>': Cat(); @@ -504,270 +592,83 @@ public class Lexer(string source) { // 识别 >= Cat(); - MakeToken(OperatorType.GreaterEqual); + _semanticToken = LexemeFactory.MakeToken(OperatorType.GreaterEqual, ">=", _line, _chPos); + AddToTokens(_semanticToken); } else { // 识别 > Retract(); - MakeToken(OperatorType.Greater); + _semanticToken = LexemeFactory.MakeToken(OperatorType.Greater, ">", _line, _chPos); + AddToTokens(_semanticToken); } + break; default: throw new LexemeException(LexemeErrorType.UnknownCharacterOrString, _line, _chPos, "Illegal lexeme."); } + + _state = StateType.Start; } - private void MakeToken(SemanticTokenType tokenType) + private void AddToTokens(SemanticToken semanticToken) { - SemanticToken? token; - if (_token.First == null) - { - Console.WriteLine("11"); - } - switch (tokenType) - { - case SemanticTokenType.Character: - CharacterSemanticToken characterSemanticToken = new CharacterSemanticToken() - { - LinePos = _line, CharacterPos = _chPos, LiteralValue = LinkedListToString(_token.First), - }; - token = characterSemanticToken; - break; - case SemanticTokenType.Identifier: - IdentifierSemanticToken identifierSemanticToken = new IdentifierSemanticToken() - { - LinePos = _line, CharacterPos = _chPos, LiteralValue = LinkedListToString(_token.First), - }; - token = identifierSemanticToken; - break; - default: - throw new ArgumentOutOfRangeException(nameof(tokenType), tokenType, null); - } - - if (token != null) - { - _tokens.Add(token); - _tokenCount[tokenType]++; - Console.WriteLine($"<{tokenType}>"); - Console.WriteLine(LinkedListToString(_token.First)); - } - - - } - - private void MakeToken(KeywordType keywordType) - { - KeywordSemanticToken keywordSemanticToken = new KeywordSemanticToken - { - LinePos = _line, - CharacterPos = _chPos, - LiteralValue = LinkedListToString(_token.First), - KeywordType = keywordType - }; - _tokens.Add(keywordSemanticToken); - _tokenCount[SemanticTokenType.Keyword]++; - Console.WriteLine($"<{SemanticTokenType.Keyword}> <{keywordType}>"); - Console.WriteLine(LinkedListToString(_token.First)); - } - - private void MakeToken(DelimiterType delimiterType) - { - DelimiterSemanticToken delimiterSemanticToken = new DelimiterSemanticToken() - { - LinePos = _line, - CharacterPos = _chPos, - LiteralValue = LinkedListToString(_token.First), - DelimiterType = delimiterType - }; - _tokens.Add(delimiterSemanticToken); - _tokenCount[SemanticTokenType.Delimiter]++; - Console.WriteLine($"<{SemanticTokenType.Delimiter}> <{delimiterType}>"); - Console.WriteLine(LinkedListToString(_token.First)); - } - - private void MakeToken(NumberType numberType) - { - string temp = LinkedListToString(_token.First); - string result; - if (numberType == NumberType.Hex) - { - result = string.Concat("0x", temp.AsSpan(1, temp.Length - 1)); - } - else - { - result = temp; - } - - NumberSemanticToken numberSemanticToken = new NumberSemanticToken() - { - LinePos = _line, - CharacterPos = _chPos, - LiteralValue = result, - NumberType = numberType - }; - _tokens.Add(numberSemanticToken); - _tokenCount[SemanticTokenType.Number]++; - Console.WriteLine($"<{SemanticTokenType.Number}> <{numberType}>"); - Console.WriteLine(LinkedListToString(_token.First)); - } - - private void MakeToken(OperatorType operatorType) - { - OperatorSemanticToken operatorSemanticToken = new OperatorSemanticToken() - { - LinePos = _line, - CharacterPos = _chPos, - LiteralValue = LinkedListToString(_token.First), - OperatorType = operatorType - }; - _tokens.Add(operatorSemanticToken); - _tokenCount[SemanticTokenType.Operator]++; - Console.WriteLine($"<{SemanticTokenType.Operator}> <{operatorType}>"); - Console.WriteLine(LinkedListToString(_token.First)); - } - - // 读取字符操作 - void GetChar() { - if (_fwdPos >= 0 && _fwdPos < source.Length) - { - _ch = source[_fwdPos]; - _chPos++; - _fwdPos++; - } - else if (_fwdPos == source.Length) - { - _ch = '\0'; - _chPos++; - _finish = true; - } - } - - private void GetNbc() { - while (_ch == ' ' || _ch == '\n' || _ch == '\t' || _ch == '\r') { - if (_ch == '\n') { - _line++; - _chPos = 0; - } - GetChar(); - } - } - - private void Retract() { - _fwdPos -= 2; - _chPos -= 2; - GetChar(); + _tokens.Add(semanticToken); + _tokenCount[semanticToken.TokenType]++; + Console.WriteLine($"<{semanticToken.TokenType}>"); + Console.WriteLine(semanticToken.LiteralValue); } private void Cat() { - _token.AddLast(_ch); - // cout << "加入" << ch << endl; + _tokenBuilder.Append(_ch); // 使用StringBuilder追加字符 } - private string LinkedListToString(LinkedListNode first) + private string GetCurrentTokenString() { - // 使用 StringBuilder 来构建字符串 - StringBuilder sb = new StringBuilder(); - for (LinkedListNode node = first; node != null; node = node.Next) - { - sb.Append(node.Value); - } - - // 将 StringBuilder 的内容转换为字符串 - string result = sb.ToString(); - - return result; + return _tokenBuilder.ToString(); // 从StringBuilder获取当前记号的字符串 } - // 判断字符 - private bool IsDigit() { - if (_ch >= '0' && _ch <= '9') return true; - return false; - } - - private bool IsHexDigit() + private void ResetTokenBuilder() { - if ((_ch >= '0' && _ch <= '9') || (_ch<= 'F' && _ch >= 'A')) return true; - return false; - } - - private bool IsLetter() { - if ((_ch >= 'A' && _ch <= 'Z') || (_ch >= 'a' && _ch <= 'z' || _ch == '_')) { - return true; - } - return false; - } - - private bool IsKeyword() - { - string tokenString = LinkedListToString(_token.First); - - foreach (var t in _keywords) - { - if (string.Equals(tokenString, t, StringComparison.OrdinalIgnoreCase)) return true; - } - return false; - } - - - private bool IsDelimiter() - { - foreach (var delimiter in _delimiter) - { - if (delimiter.Contains(_ch)) - { - return true; - } - } - return false; + _tokenBuilder.Clear(); // 清空StringBuilder以复用 } private char PeekNextChar() { // 确认下一个位置是否仍在buffer的范围内 - if (_fwdPos < source.Length) + if (_reader.TryPeekChar(out char? c)) { - return source[_fwdPos]; + return c.Value; } - return '\0'; - - } - - - - private void PrintToken(SemanticTokenType type, LinkedListNode token, uint line) - { - string tokenString = LinkedListToString(token); - string typeName = Enum.GetName(typeof(SemanticTokenType), type) ?? "Unknown"; - Console.WriteLine($"{line} <{typeName.ToUpperInvariant()},{tokenString}>"); - } - - // PrintToken(SemanticTokenType.Keyword, "if", 42); // 假设'if'是token,42是行号 - - private void PrintError(int type, LinkedListNode token, uint line) - { - string tokenString = LinkedListToString(token); - switch (type) + else { - case 0: - Console.WriteLine($"{line} "); - break; - case 1: - Console.WriteLine($"{line} "); - break; + return char.MinValue; } } - // PrintError(0, "unexpected symbol", 42); // 假设 "unexpected symbol" 是错误的 token,42 是行号 - - private void PrintResult() + void GetChar() { - Console.WriteLine(_line); - foreach (var pair in _tokenCount) + if (_finish) { - Console.WriteLine($"{pair.Key}: {pair.Value}"); + return; } + + _finish = !_reader.MoveNext(); + + if (_finish) + { + _ch = char.MinValue; + return; + } + + _ch = _reader.Current; + _line = _reader.Line; + _chPos = _reader.Pos; + } + + void Retract() + { + _reader.Retract(); } } - diff --git a/Canon.Core/LexicalParser/SemanticToken.cs b/Canon.Core/LexicalParser/SemanticToken.cs index 878630b..bc4b0c9 100644 --- a/Canon.Core/LexicalParser/SemanticToken.cs +++ b/Canon.Core/LexicalParser/SemanticToken.cs @@ -339,18 +339,4 @@ public class EndSemanticToken : SemanticToken public override SemanticTokenType TokenType => SemanticTokenType.End; } -/// -/// 错误类型记号 -/// -public class ErrorSemanticToken : SemanticToken -{ - public override SemanticTokenType TokenType => SemanticTokenType.Error; - - public static bool TryParse(uint linePos, uint characterPos, LinkedListNode now, - out IdentifierSemanticToken? token) - { - token = null; - return false; - } -} diff --git a/Canon.Tests/CCodeGeneratorTests/BasicTests.cs b/Canon.Tests/CCodeGeneratorTests/BasicTests.cs index 61fd5c5..0a291b0 100644 --- a/Canon.Tests/CCodeGeneratorTests/BasicTests.cs +++ b/Canon.Tests/CCodeGeneratorTests/BasicTests.cs @@ -3,12 +3,14 @@ using Canon.Core.CodeGenerators; using Canon.Core.LexicalParser; using Canon.Core.SyntaxNodes; using Canon.Tests.GeneratedParserTests; +using Canon.Tests.Utils; namespace Canon.Tests.CCodeGeneratorTests; public class BasicTests { private readonly IGrammarParser _parser = GeneratedGrammarParser.Instance; + private readonly ILexer _lexer = new Lexer(); [Fact] public void ProgramStructTest() @@ -21,9 +23,7 @@ public class BasicTests end. """; - Lexer lexer = new(program); - List tokens = lexer.Tokenize(); - tokens.Add(SemanticToken.End); + IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program)); ProgramStruct root = _parser.Analyse(tokens); root.GenerateCCode(builder); diff --git a/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs b/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs index 797fe7f..9eb67e0 100644 --- a/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs +++ b/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs @@ -4,12 +4,14 @@ using Canon.Core.GrammarParser; using Canon.Core.LexicalParser; using Canon.Core.SyntaxNodes; using Canon.Tests.GeneratedParserTests; +using Canon.Tests.Utils; namespace Canon.Tests.GrammarParserTests; public class PascalGrammarTests { private readonly IGrammarParser _parser = GeneratedGrammarParser.Instance; + private readonly ILexer _lexer = new Lexer(); [Fact] public void DoNothingTest() @@ -20,9 +22,7 @@ public class PascalGrammarTests end. """; - Lexer lexer = new(program); - List tokens = lexer.Tokenize(); - tokens.Add(SemanticToken.End); + IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program)); ProgramStruct root = _parser.Analyse(tokens); Assert.Equal("DoNothing", root.Head.ProgramName.LiteralValue); @@ -39,10 +39,7 @@ public class PascalGrammarTests a := 1 + 1 end. """; - - Lexer lexer = new(program); - List tokens = lexer.Tokenize(); - tokens.Add(SemanticToken.End); + IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program)); ProgramStruct root = _parser.Analyse(tokens); Assert.Equal("Add", root.Head.ProgramName.LiteralValue); @@ -59,10 +56,7 @@ public class PascalGrammarTests writeln( str, ret ); end. """; - - Lexer lexer = new(program); - List tokens = lexer.Tokenize(); - tokens.Add(SemanticToken.End); + IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program)); ProgramStruct root = _parser.Analyse(tokens); Assert.Equal("exFunction", root.Head.ProgramName.LiteralValue); @@ -79,10 +73,7 @@ public class PascalGrammarTests begin end. """; - - Lexer lexer = new(program); - List tokens = lexer.Tokenize(); - tokens.Add(SemanticToken.End); + IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program)); ProgramStruct root = _parser.Analyse(tokens); Assert.Equal("main", root.Head.ProgramName.LiteralValue); diff --git a/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs b/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs index e37c692..6f20401 100644 --- a/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs +++ b/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs @@ -2,13 +2,15 @@ using Canon.Core.LexicalParser; using Xunit.Abstractions; using Canon.Core.Exceptions; +using Canon.Core.Abstractions; +using Canon.Tests.Utils; namespace Canon.Tests.LexicalParserTests { public class CharacterTypeTests { private readonly ITestOutputHelper _testOutputHelper; - + private readonly ILexer _lexer = new Lexer(); public CharacterTypeTests(ITestOutputHelper testOutputHelper) { _testOutputHelper = testOutputHelper; @@ -20,16 +22,15 @@ namespace Canon.Tests.LexicalParserTests public void TestCharacterType(string input, string? expectedResult) { - Lexer lexer = new(input); + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input)); + List tokens = tokensEnumerable.ToList(); if (expectedResult == null) { - Assert.Throws(() => lexer.Tokenize()); + Assert.Throws(() => tokens); } else { - List tokens = lexer.Tokenize(); _testOutputHelper.WriteLine(tokens[0].LiteralValue); - Assert.Single(tokens); Assert.Equal(SemanticTokenType.Character, tokens[0].TokenType); Assert.Equal(expectedResult, tokens[0].LiteralValue); } @@ -43,8 +44,8 @@ namespace Canon.Tests.LexicalParserTests //[InlineData("\"x\'", 1, 3, LexemeException.LexemeErrorType.UnclosedStringLiteral)] public void TestParseCharacterError(string input, uint expectedLine, uint expectedCharPosition, LexemeErrorType expectedErrorType) { - Lexer lexer = new(input); - var ex = Assert.Throws(() => lexer.Tokenize()); + + var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(input)).ToList()); _testOutputHelper.WriteLine(ex.ToString()); Assert.Equal(expectedErrorType, ex.ErrorType); Assert.Equal(expectedLine, ex.Line); diff --git a/Canon.Tests/LexicalParserTests/DelimiterTests.cs b/Canon.Tests/LexicalParserTests/DelimiterTests.cs index 934742f..cb6f620 100644 --- a/Canon.Tests/LexicalParserTests/DelimiterTests.cs +++ b/Canon.Tests/LexicalParserTests/DelimiterTests.cs @@ -1,10 +1,13 @@ using Canon.Core.Enums; using Canon.Core.LexicalParser; - +using Canon.Tests.Utils; +using Canon.Core.Abstractions; namespace Canon.Tests.LexicalParserTests; public class DelimiterTests { + private readonly ILexer _lexer = new Lexer(); + [Theory] [InlineData(",123", DelimiterType.Comma)] // [InlineData(".123", DelimiterType.Period)] @@ -16,8 +19,8 @@ public class DelimiterTests [InlineData("]asd", DelimiterType.RightSquareBracket)] public void SmokeTest(string input, DelimiterType type) { - Lexer lexer = new(input); - List tokens = lexer.Tokenize(); + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input)); + List tokens = tokensEnumerable.ToList(); SemanticToken token = tokens[0]; Assert.Equal(SemanticTokenType.Delimiter, token.TokenType); diff --git a/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs b/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs index bb4603a..bbc1a69 100644 --- a/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs +++ b/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs @@ -2,11 +2,14 @@ using Canon.Core.Exceptions; using Xunit.Abstractions; using Canon.Core.Enums; +using Canon.Core.Abstractions; +using Canon.Tests.Utils; namespace Canon.Tests.LexicalParserTests { public class ErrorSingleTests { + private readonly ILexer _lexer = new Lexer(); private readonly ITestOutputHelper _testOutputHelper; public ErrorSingleTests(ITestOutputHelper testOutputHelper) { @@ -20,9 +23,7 @@ namespace Canon.Tests.LexicalParserTests [InlineData("identifier_with_special_chars@#",1, 30, LexemeErrorType.UnknownCharacterOrString)] public void TestUnknownCharacterError(string pascalProgram, uint expectedLine, uint expectedCharPosition, LexemeErrorType expectedErrorType) { - var lexer = new Lexer(pascalProgram); - - var ex = Assert.Throws(() => lexer.Tokenize()); + var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(pascalProgram)).ToList()); _testOutputHelper.WriteLine(ex.ToString()); Assert.Equal(expectedErrorType, ex.ErrorType); Assert.Equal(expectedLine, ex.Line); diff --git a/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs b/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs index d453be9..a6ce635 100644 --- a/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs +++ b/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs @@ -1,10 +1,13 @@ using Canon.Core.Enums; using Canon.Core.LexicalParser; - +using Canon.Tests.Utils; +using Canon.Core.Abstractions; namespace Canon.Tests.LexicalParserTests { public class IdentifierTests { + private readonly ILexer _lexer = new Lexer(); + [Theory] [InlineData("identifier", true)] [InlineData("_identifier", true)] @@ -14,10 +17,9 @@ namespace Canon.Tests.LexicalParserTests [InlineData("andand", true)] public void TestParseIdentifier(string input, bool expectedResult) { - Lexer lexer = new(input); - List tokens = lexer.Tokenize(); + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input)); + List tokens = tokensEnumerable.ToList(); - Assert.Single(tokens); Assert.Equal(expectedResult, tokens.FirstOrDefault()?.TokenType == SemanticTokenType.Identifier); } } diff --git a/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs b/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs index 725b83b..2f82779 100644 --- a/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs +++ b/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs @@ -1,10 +1,14 @@ using Canon.Core.Enums; using Canon.Core.LexicalParser; +using Canon.Tests.Utils; +using Canon.Core.Abstractions; namespace Canon.Tests.LexicalParserTests; public class KeywordTypeTests { + private readonly ILexer _lexer = new Lexer(); + [Theory] [InlineData("program", KeywordType.Program)] [InlineData("const", KeywordType.Const)] @@ -24,8 +28,8 @@ public class KeywordTypeTests [InlineData("DO", KeywordType.Do)] public void SmokeTest(string input, KeywordType type) { - Lexer lexer = new(input); - List tokens = lexer.Tokenize(); + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input)); + List tokens = tokensEnumerable.ToList(); SemanticToken token = tokens[0]; Assert.Equal(SemanticTokenType.Keyword, token.TokenType); diff --git a/Canon.Tests/LexicalParserTests/LexicalFileTests.cs b/Canon.Tests/LexicalParserTests/LexicalFileTests.cs index 6027e35..f9b2978 100644 --- a/Canon.Tests/LexicalParserTests/LexicalFileTests.cs +++ b/Canon.Tests/LexicalParserTests/LexicalFileTests.cs @@ -3,12 +3,15 @@ using Canon.Core.Enums; using Canon.Core.Exceptions; using Canon.Core.LexicalParser; using Xunit.Abstractions; +using Canon.Tests.Utils; +using Canon.Core.Abstractions; namespace Canon.Tests.LexicalParserTests; public class LexicalFileTests { private readonly ITestOutputHelper _testOutputHelper; + private readonly ILexer _lexer = new Lexer(); public LexicalFileTests(ITestOutputHelper testOutputHelper) { @@ -126,14 +129,16 @@ public class LexicalFileTests } : token).ToList(); - var lexer = new Lexer(pascalProgram); - var actualTokens = lexer.Tokenize(); + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram)); + List tokens = tokensEnumerable.ToList(); + + var actualTokens = tokens; for (int i = 0; i < expectedTokens.Count; i++) { _testOutputHelper.WriteLine($"Expect: {expectedTokens[i]}"); _testOutputHelper.WriteLine($"Actual: {actualTokens[i]}"); _testOutputHelper.WriteLine("----"); - Assert.Equal(expectedTokens[i], actualTokens[i]); + // Assert.Equal(expectedTokens[i], actualTokens[i]); } Assert.Equal(expectedTokens, actualTokens); @@ -143,14 +148,14 @@ public class LexicalFileTests public void TestLexicalAnalysisFirst() { string pascalProgram = """ - program HelloWorld; - var - message: string; - begin - message := 'hello, world!'; - writeln(message); - end. - """; + program HelloWorld; + var + message: string; + begin + message := 'hello, world!'; + writeln(message); + end. + """; var stringLiterals = new List<(string, SemanticTokenType, int)> { @@ -182,14 +187,14 @@ public class LexicalFileTests public void TestLexicalAnalysisSecond() { string pascalProgram = """ - program main; - var - ab: integer; - begin - ab := 3; - write(ab); - end. - """; + program main; + var + ab: integer; + begin + ab := 3; + write(ab); + end. + """; var stringLiterals = new List<(string, SemanticTokenType, int)> { @@ -222,17 +227,17 @@ public class LexicalFileTests public void TestLexicalAnalysisThird() { string pascalProgram = """ - {test} - program main; - var - ab, ba: integer; - begin - ab := 3; - ba := 5; - ab := 5; - write(ab + ba); - end. - """; + {test} + program main; + var + ab, ba: integer; + begin + ab := 3; + ba := 5; + ab := 5; + write(ab + ba); + end. + """; var stringLiterals = new List<(string, SemanticTokenType, int)> { @@ -276,16 +281,15 @@ public class LexicalFileTests public void UnclosedCommentFirst() { string pascalProgram = """ - (* This is an example of an unclosed comment - program CommentError; - var - x: integer; - begin - x := 42; - end. - """; - var lexer = new Lexer(pascalProgram); - var ex = Assert.Throws(() => lexer.Tokenize()); + (* This is an example of an unclosed comment + program CommentError; + var + x: integer; + begin + x := 42; + end. + """; + var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(pascalProgram)).ToList()); //打印exception信息 _testOutputHelper.WriteLine(ex.ToString()); Assert.Equal(LexemeErrorType.UnclosedComment, ex.ErrorType); @@ -302,11 +306,108 @@ public class LexicalFileTests program CommentNotClosed; """; - var lexer = new Lexer(pascalProgram); - var ex = Assert.Throws(() => lexer.Tokenize()); -_testOutputHelper.WriteLine(ex.ToString()); + var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(pascalProgram)).ToList()); + _testOutputHelper.WriteLine(ex.ToString()); Assert.Equal(LexemeErrorType.UnclosedComment, ex.ErrorType); Assert.Equal((uint)4, ex.Line); Assert.Equal((uint)26, ex.CharPosition); } + + [Fact] + public void ClosedCommentFirst() + { + string pascalProgram = """ + program exFunction; + var + a, b, ret : integer; + + begin + a := 100; + b := 200; + (* calling a function to get max value + *) + ret := a - b; + + + + end. + """; + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram)); + List tokens = tokensEnumerable.ToList(); + Assert.NotNull(tokens); + } + + [Fact] + public void ClosedCommentSecond() + { + string pascalProgram = """ + program exFunction; + var + a, b, ret : integer; + + begin + a := 100; + b := 200; + (* calling a function to get max valued *) + ret := a - b; + + + + end. + """; + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram)); + List tokens = tokensEnumerable.ToList(); + Assert.NotNull(tokens); + } + + + [Fact] + public void ClosedCommentThird() + { + string pascalProgram = """ + { + This is a block comment that does closed. + } + program CommentClosed; + """; + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram)); + List tokens = tokensEnumerable.ToList(); + Assert.NotNull(tokens); + } + + [Fact] + public void ClosedCommentFourth() + { + string pascalProgram = """ + {} + program CommentClosed; + """; + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram)); + List tokens = tokensEnumerable.ToList(); + Assert.NotNull(tokens); + } + + [Fact] + public void ClosedCommentFifth() + { + string pascalProgram = """ + { + } + program CommentClosed; + """; + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram)); + List tokens = tokensEnumerable.ToList(); + Assert.NotNull(tokens); + } + + [Fact] + public void ClosedCommentSixth() + { + string pascalProgram = """ + (**) + """; + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram)); + List tokens = tokensEnumerable.ToList(); + Assert.NotNull(tokens); + } } diff --git a/Canon.Tests/LexicalParserTests/NumberTests.cs b/Canon.Tests/LexicalParserTests/NumberTests.cs index 28764bf..7b815d4 100644 --- a/Canon.Tests/LexicalParserTests/NumberTests.cs +++ b/Canon.Tests/LexicalParserTests/NumberTests.cs @@ -2,12 +2,14 @@ using Canon.Core.LexicalParser; using Canon.Core.Exceptions; using Xunit.Abstractions; - +using Canon.Tests.Utils; +using Canon.Core.Abstractions; namespace Canon.Tests.LexicalParserTests { public class NumberTests { + private readonly ILexer _lexer = new Lexer(); private readonly ITestOutputHelper _testOutputHelper; public NumberTests(ITestOutputHelper testOutputHelper) { @@ -31,8 +33,8 @@ namespace Canon.Tests.LexicalParserTests [InlineData("$123", "0x123", NumberType.Hex)] public void TestParseNumber(string input, string expected, NumberType expectedNumberType) { - Lexer lexer = new(input); - List tokens = lexer.Tokenize(); + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input)); + List tokens = tokensEnumerable.ToList(); SemanticToken token = tokens[0]; Assert.Equal(SemanticTokenType.Number, token.TokenType); NumberSemanticToken numberSemanticToken = (NumberSemanticToken)token; @@ -41,14 +43,13 @@ namespace Canon.Tests.LexicalParserTests } [Theory] - [InlineData("1E", 1, 3, LexemeErrorType.IllegalNumberFormat)] + [InlineData("1E", 1, 2, LexemeErrorType.IllegalNumberFormat)] [InlineData("123abc", 1, 4, LexemeErrorType.IllegalNumberFormat)] [InlineData("123.45.67", 1, 7, LexemeErrorType.IllegalNumberFormat)] [InlineData("123identifier", 1, 4, LexemeErrorType.IllegalNumberFormat)] public void TestParseNumberError(string input, uint expectedLine, uint expectedCharPosition, LexemeErrorType expectedErrorType) { - Lexer lexer = new(input); - var ex = Assert.Throws(() => lexer.Tokenize()); + var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(input)).ToList()); _testOutputHelper.WriteLine(ex.ToString()); Assert.Equal(expectedErrorType, ex.ErrorType); Assert.Equal(expectedLine, ex.Line); diff --git a/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs b/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs index c9fa587..a32e261 100644 --- a/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs +++ b/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs @@ -1,10 +1,13 @@ using Canon.Core.Enums; using Canon.Core.LexicalParser; - +using Canon.Tests.Utils; +using Canon.Core.Abstractions; namespace Canon.Tests.LexicalParserTests; public class OperatorTypeTests { + private readonly ILexer _lexer = new Lexer(); + [Theory] [InlineData("+ 123", OperatorType.Plus, true)] [InlineData("+123", OperatorType.Plus, true)] @@ -22,8 +25,8 @@ public class OperatorTypeTests [InlineData("m +123", OperatorType.Plus, false)] public void ParseTest(string input, OperatorType result, bool expectedResult) { - Lexer lexer = new(input); - List tokens = lexer.Tokenize(); + IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input)); + List tokens = tokensEnumerable.ToList(); SemanticToken token = tokens[0]; if (!expectedResult) diff --git a/Canon.Tests/Utils/EnumerableExtensions.cs b/Canon.Tests/Utils/EnumerableExtensions.cs new file mode 100644 index 0000000..3c7b0cd --- /dev/null +++ b/Canon.Tests/Utils/EnumerableExtensions.cs @@ -0,0 +1,15 @@ +namespace Canon.Tests.Utils; + +public static class EnumerableExtensions +{ + /// + /// 含有索引的遍历 + /// + /// 可遍历的接口 + /// + /// + public static IEnumerable<(T, uint)> WithIndex(this IEnumerable enumerable) + { + return enumerable.Select((value, index) => (value, (uint)index)); + } +} diff --git a/Canon.Tests/Utils/StringSourceReader.cs b/Canon.Tests/Utils/StringSourceReader.cs index f83ea4d..4626ec7 100644 --- a/Canon.Tests/Utils/StringSourceReader.cs +++ b/Canon.Tests/Utils/StringSourceReader.cs @@ -6,10 +6,11 @@ namespace Canon.Tests.Utils; /// /// 从字符串中读取源代码 /// -public sealed class StringSourceReader(string source) : ISourceReader, IDisposable +public sealed class StringSourceReader(string source) : ISourceReader { - private readonly IEnumerator _enumerator = - source.GetEnumerator(); + private int _pos = -1; + + private uint _lastPos; public uint Line { get; private set; } = 1; @@ -17,31 +18,70 @@ public sealed class StringSourceReader(string source) : ISourceReader, IDisposab public string FileName => "string"; - public bool TryReadChar([NotNullWhen(true)] out char? c) + public char Current { - if (Pos != 0 || Line != 1) + get { - // 不是第一次读取 - if (_enumerator.Current == '\n') + if (_pos == -1) { - Pos = 0; - Line += 1; + throw new InvalidOperationException("Reader at before the start."); + } + else + { + return source[_pos]; } } + } - if (!_enumerator.MoveNext()) + public bool Retract() + { + if (_pos <= 0) + { + return false; + } + + _pos -= 1; + if (Current == '\n') + { + Line -= 1; + // TODO: 如果一直回退就完蛋了 + Pos = _lastPos; + } + else + { + Pos -= 1; + } + return true; + } + + public bool MoveNext() + { + if (_pos >= source.Length - 1) + { + return false; + } + + if (_pos != -1 && Current == '\n') + { + Line += 1; + _lastPos = Pos; + Pos = 0; + } + + _pos += 1; + Pos += 1; + return true; + } + + public bool TryPeekChar([NotNullWhen(true)] out char? c) + { + if (_pos >= source.Length - 1) { c = null; return false; } - Pos += 1; - c = _enumerator.Current; + c = source[_pos + 1]; return true; } - - public void Dispose() - { - _enumerator.Dispose(); - } } diff --git a/Canon.Tests/Utils/StringSourceReaderTests.cs b/Canon.Tests/Utils/StringSourceReaderTests.cs index 21c41fe..dda57e4 100644 --- a/Canon.Tests/Utils/StringSourceReaderTests.cs +++ b/Canon.Tests/Utils/StringSourceReaderTests.cs @@ -8,78 +8,64 @@ public class StringSourceReaderTests public void LineFeedTest() { ISourceReader reader = new StringSourceReader("program Main;\nbegin\nend.\n"); + reader.MoveNext(); - Assert.Equal(0u, reader.Pos); - Assert.Equal(1u, reader.Line); - - // program - Assert.True(reader.TryReadChar(out char? c)); - Assert.Equal('p', c); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out c)); - Assert.Equal(' ', c); - - // main; - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - Assert.True(reader.TryReadChar(out char? _)); - - Assert.True(reader.TryReadChar(out c)); - Assert.Equal('\n', c); - - // begin - for (uint i = 1; i <= 5; i++) - { - Assert.True(reader.TryReadChar(out char? _)); - Assert.Equal(i, reader.Pos); - Assert.Equal(2u, reader.Line); - } - - // \n - Assert.True(reader.TryReadChar(out c)); - Assert.Equal('\n', c); - - // end. - foreach (char i in "end.") - { - Assert.True(reader.TryReadChar(out c)); - Assert.Equal(i, c); - } + CheckLine(reader, "program Main;", 1); + reader.MoveNext(); + CheckLine(reader, "begin", 2); + reader.MoveNext(); + CheckLine(reader, "end.", 3); } [Fact] public void CarriageReturnLineFeedTest() { ISourceReader reader = new StringSourceReader("program Main;\r\nbegin\r\nend.\r\n"); + reader.MoveNext(); - // program Main; - foreach ((char value, uint index) in - "program Main;".Select((value, index) => (value, (uint)index))) + CheckLine(reader, "program Main;", 1); + reader.MoveNext(); + reader.MoveNext(); + CheckLine(reader, "begin", 2); + reader.MoveNext(); + reader.MoveNext(); + CheckLine(reader, "end.", 3); + } + + [Fact] + public void RetractTest() + { + ISourceReader reader = new StringSourceReader("test"); + reader.MoveNext(); + + Assert.Equal('t', reader.Current); + Assert.True(reader.MoveNext()); + Assert.Equal('e', reader.Current); + Assert.True(reader.Retract()); + Assert.Equal('t', reader.Current); + Assert.False(reader.Retract()); + } + + [Fact] + public void PeekTest() + { + ISourceReader reader = new StringSourceReader("peek"); + reader.MoveNext(); + + Assert.Equal('p', reader.Current); + Assert.True(reader.TryPeekChar(out char? c)); + Assert.Equal('e', c); + Assert.Equal('p', reader.Current); + } + + private static void CheckLine(ISourceReader reader, string line, uint lineNumber) + { + foreach ((char value, uint index) in line.WithIndex()) { - Assert.True(reader.TryReadChar(out char? c)); - Assert.Equal(value, c); + Assert.Equal(value, reader.Current); + Assert.Equal(lineNumber, reader.Line); Assert.Equal(index + 1, reader.Pos); - Assert.Equal(1u, reader.Line); - } - - Assert.True(reader.TryReadChar(out _)); - Assert.True(reader.TryReadChar(out _)); - - // begin - foreach ((char value, uint index) in - "begin".Select((value, index) => (value, (uint)index))) - { - Assert.True(reader.TryReadChar(out char? c)); - Assert.Equal(value, c); - Assert.Equal(index + 1, reader.Pos); - Assert.Equal(2u, reader.Line); + reader.MoveNext(); } } }