diff --git a/Canon.Core/Abstractions/ISourceReader.cs b/Canon.Core/Abstractions/ISourceReader.cs
index 11336cb..7d3a04d 100644
--- a/Canon.Core/Abstractions/ISourceReader.cs
+++ b/Canon.Core/Abstractions/ISourceReader.cs
@@ -7,12 +7,7 @@ namespace Canon.Core.Abstractions;
///
public interface ISourceReader
{
- ///
- /// 尝试读取下一个字符
- ///
- /// 读取到的字符
- /// 是否成功读取
- public bool TryReadChar([NotNullWhen(true)] out char? c);
+ public char Current { get; }
///
/// 源文件名称
@@ -28,4 +23,23 @@ public interface ISourceReader
/// 当前读取字符的列号
///
public uint Pos { get; }
+
+ ///
+ /// 回退一个字符
+ ///
+ /// 回退是否成功
+ public bool Retract();
+
+ ///
+ /// 前进一个字符
+ ///
+ ///
+ public bool MoveNext();
+
+ ///
+ /// 读取下一个字符但是移进
+ ///
+ /// 读取到的下一个字符
+ /// 是否能够读取下一个字符
+ public bool TryPeekChar([NotNullWhen(true)] out char? c);
}
diff --git a/Canon.Core/Enums/SemanticEnums.cs b/Canon.Core/Enums/SemanticEnums.cs
index 13be0ec..05d0087 100644
--- a/Canon.Core/Enums/SemanticEnums.cs
+++ b/Canon.Core/Enums/SemanticEnums.cs
@@ -9,7 +9,6 @@ public enum SemanticTokenType
Identifier,
Character,
Empty,
- Error, // 加了一个错误token
///
/// 语法分析中的栈底符号
///
@@ -90,10 +89,15 @@ public enum NumberType
public enum StateType
{
+ Start,
+ Comment,
Word,
- Digit,
+ Num,
Delimiter,
- Operator
+ Operator,
+ BreakPoint,
+ Unknown,
+ Done
}
public enum BasicIdType
diff --git a/Canon.Core/LexicalParser/LexRules.cs b/Canon.Core/LexicalParser/LexRules.cs
new file mode 100644
index 0000000..2746679
--- /dev/null
+++ b/Canon.Core/LexicalParser/LexRules.cs
@@ -0,0 +1,83 @@
+namespace Canon.Core.LexicalParser;
+
+public static class LexRules
+{
+ // 保留关键字
+ private static readonly string[] _keywords =
+ [
+ "Program", "Const", "Var", "Procedure",
+ "Function", "Begin", "End", "Array",
+ "Of", "If", "Then", "Else",
+ "For", "To", "Do", "Integer",
+ "Real", "Boolean", "Character", "Divide",
+ "Not", "Mod", "And", "Or"
+ ];
+
+ private static readonly string[] _delimiter = [";", ",", ":", ".", "(", ")", "[", "]", "'", "\"", ".."];
+
+ private static readonly string[] _operator = ["=", "<>", "<", "<=", ">", ">=", "+", "-", "*", "/", ":="];
+
+ // 判断字符
+ public static bool IsDigit(char _ch) {
+ if (_ch >= '0' && _ch <= '9') return true;
+ return false;
+ }
+
+ public static bool IsHexDigit(char _ch)
+ {
+ if ((_ch >= '0' && _ch <= '9') || (_ch<= 'F' && _ch >= 'A')) return true;
+ return false;
+ }
+
+ public static bool IsLetter(char _ch) {
+ if ((_ch >= 'A' && _ch <= 'Z') || (_ch >= 'a' && _ch <= 'z' || _ch == '_')) {
+ return true;
+ }
+ return false;
+ }
+
+ public static bool IsKeyword(string tokenString)
+ {
+
+ foreach (var t in _keywords)
+ {
+ if (string.Equals(tokenString, t, StringComparison.OrdinalIgnoreCase)) return true;
+ }
+ return false;
+ }
+
+
+ public static bool IsDelimiter(char ch)
+ {
+ foreach (var delimiter in _delimiter)
+ {
+ if (delimiter.Contains(ch))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public static bool IsOperator(char ch)
+ {
+ foreach (var o in _operator)
+ {
+ if (o.Contains(ch))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public static bool IsBreakPoint(char ch)
+ {
+ if (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r')
+ {
+ return true;
+ }
+
+ return false;
+ }
+}
diff --git a/Canon.Core/LexicalParser/LexemeFactory.cs b/Canon.Core/LexicalParser/LexemeFactory.cs
new file mode 100644
index 0000000..c612dcd
--- /dev/null
+++ b/Canon.Core/LexicalParser/LexemeFactory.cs
@@ -0,0 +1,95 @@
+using Canon.Core.Enums;
+
+namespace Canon.Core.LexicalParser;
+
+public static class LexemeFactory
+{
+
+ public static SemanticToken MakeToken(SemanticTokenType tokenType,string literal,uint _line,uint _chPos)
+ {
+ SemanticToken? token;
+ switch (tokenType)
+ {
+ case SemanticTokenType.Character:
+ CharacterSemanticToken characterSemanticToken = new CharacterSemanticToken()
+ {
+ LinePos = _line, CharacterPos = _chPos, LiteralValue = literal,
+ };
+ token = characterSemanticToken;
+ break;
+ case SemanticTokenType.Identifier:
+ IdentifierSemanticToken identifierSemanticToken = new IdentifierSemanticToken()
+ {
+ LinePos = _line, CharacterPos = _chPos, LiteralValue = literal,
+ };
+ token = identifierSemanticToken;
+ break;
+ default:
+ throw new ArgumentOutOfRangeException(nameof(tokenType), tokenType, null);
+ }
+
+ return token;
+
+
+ }
+
+ public static KeywordSemanticToken MakeToken(KeywordType keywordType,string literal,uint _line,uint _chPos)
+ {
+ KeywordSemanticToken keywordSemanticToken = new KeywordSemanticToken
+ {
+ LinePos = _line,
+ CharacterPos = _chPos,
+ LiteralValue = literal,
+ KeywordType = keywordType
+ };
+ return keywordSemanticToken;
+ }
+
+ public static DelimiterSemanticToken MakeToken(DelimiterType delimiterType,string literal,uint _line,uint _chPos)
+ {
+ DelimiterSemanticToken delimiterSemanticToken = new DelimiterSemanticToken()
+ {
+ LinePos = _line,
+ CharacterPos = _chPos,
+ LiteralValue = literal,
+ DelimiterType = delimiterType
+ };
+ return delimiterSemanticToken;
+ }
+
+ public static NumberSemanticToken MakeToken(NumberType numberType,string literal,uint _line,uint _chPos)
+ {
+ string temp = literal;
+ string result;
+ if (numberType == NumberType.Hex)
+ {
+ result = string.Concat("0x", temp.AsSpan(1, temp.Length - 1));
+ }
+ else
+ {
+ result = temp;
+ }
+
+ NumberSemanticToken numberSemanticToken = new NumberSemanticToken()
+ {
+ LinePos = _line,
+ CharacterPos = _chPos,
+ LiteralValue = result,
+ NumberType = numberType
+ };
+ return numberSemanticToken;
+
+ }
+
+ public static OperatorSemanticToken MakeToken(OperatorType operatorType,string literal,uint _line,uint _chPos)
+ {
+ OperatorSemanticToken operatorSemanticToken = new OperatorSemanticToken()
+ {
+ LinePos = _line,
+ CharacterPos = _chPos,
+ LiteralValue = literal,
+ OperatorType = operatorType
+ };
+ return operatorSemanticToken;
+ }
+}
diff --git a/Canon.Core/LexicalParser/Lexer.cs b/Canon.Core/LexicalParser/Lexer.cs
index abab562..edcd595 100644
--- a/Canon.Core/LexicalParser/Lexer.cs
+++ b/Canon.Core/LexicalParser/Lexer.cs
@@ -1,50 +1,29 @@
-using System.Numerics;
using System.Text;
+using Canon.Core.Abstractions;
using Canon.Core.Enums;
using Canon.Core.Exceptions;
namespace Canon.Core.LexicalParser;
-public class Lexer(string source)
+public class Lexer : ILexer
{
-
- // 保留关键字
- private readonly string[] _keywords =
- [
- "Program", "Const", "Var", "Procedure",
- "Function", "Begin", "End", "Array",
- "Of", "If", "Then", "Else",
- "For", "To", "Do", "Integer",
- "Real", "Boolean", "Character", "Divide",
- "Not", "Mod", "And", "Or"
- ];
-
- private readonly string[] _delimiter = [";", ",", ":", ".", "(", ")", "[", "]", "'", "\"", ".."];
-
- private readonly string[] _operator = ["=", "<>", "<", "<=", ">", ">=", "+", "-", "*", "/", ":="];
+ // 记录token
+ private SemanticToken? _semanticToken;
+ private readonly StringBuilder _tokenBuilder = new();
+ private readonly List _tokens = [];
// 状态机
- private StateType _state;
+ private StateType _state = StateType.Start;
private char _ch;
+ private bool _finish;
- private LinkedList _token = new LinkedList();
-
- // bool save;
- // int saved_state;
- bool _finish;
-
-
- //缓冲区
- private readonly char[] _buffer = new char[2048];
-
- // int start_pos;
- private int _fwdPos;
-
- // 计数器
+ // 文件读取
+ private ISourceReader _reader;
private uint _line = 1;
private uint _chPos;
- private readonly Dictionary _tokenCount = new Dictionary
+ // Token统计信息
+ private readonly Dictionary _tokenCount = new()
{
{ SemanticTokenType.Keyword, 0 },
{ SemanticTokenType.Number, 0 },
@@ -52,264 +31,244 @@ public class Lexer(string source)
{ SemanticTokenType.Delimiter, 0 },
{ SemanticTokenType.Identifier, 0 },
{ SemanticTokenType.Character, 0 },
- { SemanticTokenType.Error, 0 },
{ SemanticTokenType.End, 0 }
};
- private readonly List _tokens = [];
-
- public List Tokenize()
+ public IEnumerable Tokenize(ISourceReader reader)
{
- // 缓冲区
- // start_pos = 0;
- _fwdPos = 0;
+ _reader = reader;
- // 状态机
- _finish = false;
-
- while (!_finish)
+ while (_state != StateType.Done)
{
- GetChar();
- GetNbc();
- if (_finish) break;
+ switch (_state)
+ {
+ case StateType.Start:
+ HandleStartState();
+ break;
+ case StateType.Comment:
+ if (_ch == '{')
+ {
+ HandleCommentStateBig();
+ }
+ else if (_ch == '*')
+ {
+ HandleCommentStateSmall();
+ }
+ else
+ {
+ HandleCommentSingleLine();
+ }
- _token = new LinkedList();
+ break;
+ case StateType.Num:
+ HandleNumState();
+ break;
+ case StateType.Word:
+ HandleWordState();
+ break;
+ case StateType.Delimiter:
+ HandleDelimiterState();
+ break;
+ case StateType.Operator:
+ HandleOperatorState();
+ break;
+ case StateType.BreakPoint:
+ while (LexRules.IsBreakPoint(_ch))
+ {
+ GetChar();
+ }
- if (IsLetter())
- {
- _state = StateType.Word;
+ Retract();
+ _state = StateType.Start;
+ break;
+ case StateType.Unknown:
+ throw new LexemeException(LexemeErrorType.UnknownCharacterOrString, _line, _chPos,
+ "Illegal lexeme.");
+ case StateType.Done:
+ break;
}
- else if(_ch == '.')
+ }
+
+ _tokens.Add(SemanticToken.End);
+ return _tokens;
+ }
+
+ private void HandleStartState()
+ {
+ // 初始化
+ ResetTokenBuilder();
+
+ // 读取首个字符
+ GetChar();
+
+ if (_finish)
+ {
+ _state = StateType.Done;
+ return;
+ }
+
+ // 根据首个字符判断可能的情况
+ if (_ch == '{') // 以 “{” 开头,为注释
+ {
+ _state = StateType.Comment;
+ }
+ else if (_ch == '(')
+ {
+ char nextChar = PeekNextChar();
+ if (nextChar == '*')
{
- char next = PeekNextChar();
- if (next >= '0' && next <= '9')
- {
- _state = StateType.Digit;
- }
- else
- {
- _state = StateType.Delimiter;
- }
+ GetChar();
+ _state = StateType.Comment;
}
- else if (IsDigit() || _ch == '$')
- {
- _state = StateType.Digit;
- }
- else if (IsDelimiter())
+ else
{
_state = StateType.Delimiter;
}
- else if (_ch == '{')
+ }
+ else if (_ch == '/')
+ {
+ char nextChar = PeekNextChar();
+ if (nextChar == '/')
{
- while (_ch != '}')
- {
- GetChar();
- if (_ch == '\n')
- {
- _line++;
- _chPos = 0;
- }
- if (_finish)
- {
- throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, "The comment is not closed.");
- }
-
- }
-
- continue;
+ GetChar();
+ _state = StateType.Comment;
}
else
{
_state = StateType.Operator;
}
-
- switch (_state)
- {
- case StateType.Word:
- while (IsDigit() || IsLetter())
- {
- Cat();
- GetChar();
- }
-
- Retract();
-
- if (IsKeyword())
- {
- KeywordType keywordType =
- KeywordSemanticToken.GetKeywordTypeByKeyword(LinkedListToString(_token.First));
- MakeToken(keywordType);
- }
- else
- {
- MakeToken(SemanticTokenType.Identifier);
- }
-
- break;
- case StateType.Digit:
- DealNumber();
- break;
- case StateType.Delimiter:
- Cat();
- switch (_ch)
- {
- case '.':
- {
- GetChar();
- if (_ch == '.')
- {
- Cat();
- MakeToken(DelimiterType.DoubleDots);
- break;
- }
-
- Retract();
- if (IsDot())
- {
- MakeToken(DelimiterType.Dot);
- }
- else
- {
- MakeToken(DelimiterType.Period);
- }
- }
- break;
- case '\'':
- case '\"':
- {
- // 重置_token,准备收集字符串内容
- _token = new LinkedList();
-
- GetChar(); // 移动到下一个字符,即字符串的第一个字符
- while (_ch != '\'' && _ch != '\"')
- {
- Cat(); // 收集字符
- GetChar(); // 移动到下一个字符
- if (_ch == '\n' || _finish)
- {
- throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos, "The String is not closed.");
- }
- }
-
- MakeToken(SemanticTokenType.Character); // 或其它适用于字符串字面量的SemanticTokenType
- _token = new LinkedList(); // 重置_token
-
- if (!(_ch == '\'' || _ch == '\"'))
- {
- throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos, "The String is not closed.");
- }
- }
- break;
- case ',':
- MakeToken(DelimiterType.Comma);
- break;
- case ':':
- char nextChar = PeekNextChar();
- if (nextChar == '=')
- {
- GetChar();
- Cat();
- MakeToken(OperatorType.Assign);
- }
- else
- {
- MakeToken(DelimiterType.Colon);
- }
-
- break;
- case ';':
- MakeToken(DelimiterType.Semicolon);
- break;
- case '(':
- char next = PeekNextChar();
- if (next == '*')
- {
- GetChar();
- bool commentClosed = false;
- while (!commentClosed)
- {
- GetNbc();
- GetChar();
- while (_ch != '*')
- {
- GetNbc();
- GetChar();
- if (_finish)
- {
- throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, "The comment is not closed.");
- }
- }
-
- GetChar();
- if (_finish)
- {
- throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos, "The comment is not closed.");
- }
-
- if (_ch == ')') commentClosed = true;
- }
- }
- else
- {
- MakeToken(DelimiterType.LeftParenthesis);
- }
-
- break;
- case ')':
- MakeToken(DelimiterType.RightParenthesis);
- break;
- case '[':
- MakeToken(DelimiterType.LeftSquareBracket);
- break;
- case ']':
- MakeToken(DelimiterType.RightSquareBracket);
- break;
- }
-
- break;
- case StateType.Operator:
- DealOther();
- break;
- default:
- throw new ArgumentOutOfRangeException();
- }
-
}
-
- return _tokens;
+ else if (_ch == '.') // 以 “.” 开头,可能是数字或分隔符
+ {
+ char next = PeekNextChar();
+ if (next is >= '0' and <= '9')
+ {
+ _state = StateType.Num;
+ }
+ else
+ {
+ _state = StateType.Delimiter;
+ }
+ }
+ else if (LexRules.IsLetter(_ch)) // 以字母开头,为关键字或标识符
+ {
+ _state = StateType.Word;
+ }
+ else if (LexRules.IsDigit(_ch) || _ch == '$') // 以数字或 “$” 开头,为数值
+ {
+ _state = StateType.Num;
+ }
+ else if (LexRules.IsDelimiter(_ch)) // 为分隔符
+ {
+ _state = StateType.Delimiter;
+ }
+ else if (LexRules.IsOperator(_ch)) // 为运算符
+ {
+ _state = StateType.Operator;
+ }
+ else if (LexRules.IsBreakPoint(_ch))
+ {
+ _state = StateType.BreakPoint;
+ }
+ else
+ {
+ _state = StateType.Unknown;
+ }
}
- private void DealNumber()
+ private void HandleCommentStateBig()
{
+ while (_ch != '}')
+ {
+ GetChar();
+
+ if (_finish)
+ {
+ throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos,
+ "The comment is not closed.");
+ }
+ }
+
+ _state = StateType.Start;
+ }
+
+ private void HandleCommentStateSmall()
+ {
+ bool commentClosed = false;
+ while (!commentClosed)
+ {
+ GetChar();
+ while (_ch != '*')
+ {
+ GetChar();
+ if (_finish)
+ {
+ throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos,
+ "The comment is not closed.");
+ }
+ }
+
+ GetChar();
+ if (_finish)
+ {
+ throw new LexemeException(LexemeErrorType.UnclosedComment, _line, _chPos,
+ "The comment is not closed.");
+ }
+
+ if (_ch == ')') commentClosed = true;
+ }
+
+ _state = StateType.Start;
+ }
+
+ private void HandleCommentSingleLine()
+ {
+ while (_ch != '\n')
+ {
+ GetChar();
+ }
+
+ _state = StateType.Start;
+ }
+
+ private void HandleWordState()
+ {
+ while (LexRules.IsDigit(_ch) || LexRules.IsLetter(_ch))
+ {
+ Cat();
+ GetChar();
+ }
+
+ Retract();
+
+ string tokenString = GetCurrentTokenString();
+ if (LexRules.IsKeyword(tokenString))
+ {
+ KeywordType keywordType =
+ KeywordSemanticToken.GetKeywordTypeByKeyword(GetCurrentTokenString());
+
+ _semanticToken = LexemeFactory.MakeToken(keywordType, tokenString, _line, _chPos);
+ }
+ else
+ {
+ _semanticToken = LexemeFactory.MakeToken(SemanticTokenType.Identifier, tokenString, _line, _chPos);
+ }
+
+ AddToTokens(_semanticToken);
+ _state = StateType.Start;
+ }
+
+ private void HandleNumState()
+ {
+ NumberType numberType = NumberType.Integer;
// 十六进制
if (_ch == '$')
{
- Cat();
-
- GetChar();
- while (!NumberShouldBreak())
- {
- // 假设IsHexDigit方法能够识别十六进制数字
- if (IsHexDigit())
- {
- Cat();
- GetChar();
- }
- else if(NumberShouldBreak())
- {
- break;
- }
- else
- {
- throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal hex numbers!");
- }
- }
- MakeToken(NumberType.Hex);
- return;
+ ProcessHex();
+ numberType = NumberType.Hex;
}
-
// 非十六进制
- if(IsDigit() || _ch == '.')
+ else if (LexRules.IsDigit(_ch) || _ch == '.')
{
while (!NumberShouldBreak())
{
@@ -321,61 +280,66 @@ public class Lexer(string source)
if (next == '.')
{
Retract();
+ _state = StateType.Delimiter;
break;
}
// 不是符号 “..”,进入小数点后的判断
- Cat(); // 记录“.”
+ Cat(); // 记录“.”
// “.”后不应为空,至少应该有一位小数
GetChar();
if (NumberShouldBreak())
{
- throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal numbers!");
+ throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos,
+ "Illegal numbers!");
}
// 读取小数点后的数字
while (!NumberShouldBreak())
{
- if (IsDigit())
+ if (LexRules.IsDigit(_ch))
{
Cat();
GetChar();
}
else if (_ch == 'e' || _ch == 'E')
{
- DealE();
+ ProcessE();
break;
}
- else if(NumberShouldBreak())
+ else if (NumberShouldBreak())
{
break;
}
else
{
- throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal number.");
+ throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos,
+ "Illegal number.");
}
}
- MakeToken(NumberType.Real);
- return;
+
+ numberType = NumberType.Real;
+ break;
}
// 不含小数部分,含科学计数法
if (_ch == 'e' || _ch == 'E')
{
- DealE();
- MakeToken(NumberType.Real);
- return;
+ ProcessE();
+ numberType = NumberType.Real;
+ break;
}
// 暂时为整数
- if (IsDigit())
+ if (LexRules.IsDigit(_ch))
{
Cat();
GetChar();
}
- else if(NumberShouldBreak())
+ else if (NumberShouldBreak())
{
+ numberType = NumberType.Integer;
break;
}
else
@@ -383,16 +347,44 @@ public class Lexer(string source)
throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos, "Illegal number.");
}
}
- MakeToken(NumberType.Integer);
}
+ _semanticToken = LexemeFactory.MakeToken(numberType, GetCurrentTokenString(),
+ _line, _chPos);
+ AddToTokens(_semanticToken);
+ _state = StateType.Start;
}
- private void DealE()
+ private void ProcessHex()
{
Cat();
GetChar();
- if (IsDigit() || _ch == '+' || _ch == '-')
+
+ while (!NumberShouldBreak())
+ {
+ // 假设IsHexDigit方法能够识别十六进制数字
+ if (LexRules.IsHexDigit(_ch))
+ {
+ Cat();
+ GetChar();
+ }
+ else if (NumberShouldBreak())
+ {
+ break;
+ }
+ else
+ {
+ throw new LexemeException(LexemeErrorType.IllegalNumberFormat, _line, _chPos,
+ "Illegal hex numbers!");
+ }
+ }
+ }
+
+ private void ProcessE()
+ {
+ Cat();
+ GetChar();
+ if (LexRules.IsDigit(_ch) || _ch == '+' || _ch == '-')
{
Cat();
}
@@ -405,7 +397,7 @@ public class Lexer(string source)
GetChar();
while (!NumberShouldBreak())
{
- if (IsDigit())
+ if (LexRules.IsDigit(_ch))
{
Cat();
GetChar();
@@ -419,7 +411,8 @@ public class Lexer(string source)
bool NumberShouldBreak()
{
- if (_ch == ' ' || _ch == '\n' || _ch == '\t' || _ch == '\r' || (IsDelimiter() && _ch!='.') || IsOperator() || _finish)
+ if (_ch == ' ' || _ch == '\n' || _ch == '\t' || _ch == '\r' || (LexRules.IsDelimiter(_ch) && _ch != '.') ||
+ LexRules.IsOperator(_ch) || _finish)
{
Retract();
return true;
@@ -428,18 +421,6 @@ public class Lexer(string source)
return false;
}
- private bool IsOperator()
- {
- foreach (var o in _operator)
- {
- if (o.Contains(_ch))
- {
- return true;
- }
- }
- return false;
- }
-
private bool IsDot()
{
if (_tokens.Count != 0)
@@ -447,33 +428,136 @@ public class Lexer(string source)
SemanticToken tokenBefore = _tokens.Last();
if (tokenBefore.TokenType == SemanticTokenType.Identifier) return true;
}
+
return false;
}
+ private void HandleDelimiterState()
+ {
+ Cat();
+ switch (_ch)
+ {
+ case '.':
+ {
+ GetChar();
+ if (_ch == '.')
+ {
+ Cat();
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.DoubleDots, "..", _line, _chPos);
+ break;
+ }
- private void DealOther()
+ Retract();
+ if (IsDot())
+ {
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.Dot, ".", _line, _chPos);
+ }
+ else
+ {
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.Period, ".", _line, _chPos);
+ }
+ }
+ break;
+ case '\'':
+ case '\"':
+ {
+ // 重置_token,准备收集字符串内容
+ ResetTokenBuilder();
+
+ GetChar(); // 移动到下一个字符,即字符串的第一个字符
+ while (_ch != '\'' && _ch != '\"')
+ {
+ Cat(); // 收集字符
+ GetChar(); // 移动到下一个字符
+ if (_ch == '\n' || _finish)
+ {
+ throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos,
+ "The String is not closed.");
+ }
+ }
+
+ _semanticToken = LexemeFactory.MakeToken(SemanticTokenType.Character,
+ GetCurrentTokenString(), _line, _chPos);
+
+ ResetTokenBuilder();
+
+ if (!(_ch == '\'' || _ch == '\"'))
+ {
+ throw new LexemeException(LexemeErrorType.UnclosedStringLiteral, _line, _chPos,
+ "The String is not closed.");
+ }
+ }
+ break;
+ case ',':
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.Comma, ",", _line, _chPos);
+
+ break;
+ case ':':
+ char nextChar = PeekNextChar();
+ if (nextChar == '=')
+ {
+ GetChar();
+ Cat();
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Assign, ":=", _line, _chPos);
+ }
+ else
+ {
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.Colon, ":", _line, _chPos);
+ }
+
+ break;
+ case ';':
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.Semicolon, ";", _line, _chPos);
+
+ break;
+ case '(':
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.LeftParenthesis, "(", _line, _chPos);
+ break;
+ case ')':
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.RightParenthesis, ")", _line, _chPos);
+
+ break;
+ case '[':
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.LeftSquareBracket, "[", _line, _chPos);
+
+ break;
+ case ']':
+ _semanticToken = LexemeFactory.MakeToken(DelimiterType.RightSquareBracket, "]", _line, _chPos);
+ break;
+ }
+
+ AddToTokens(_semanticToken);
+ _state = StateType.Start;
+ }
+
+ private void HandleOperatorState()
{
switch (_ch)
{
case '+': // 识别 +
Cat();
- MakeToken(OperatorType.Plus);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Plus, "+", _line, _chPos);
+ AddToTokens(_semanticToken);
break;
case '-': // 识别 -
Cat();
- MakeToken(OperatorType.Minus);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Minus, "-", _line, _chPos);
+ AddToTokens(_semanticToken);
break;
case '*': // 识别 *
Cat();
- MakeToken(OperatorType.Multiply);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Multiply, "*", _line, _chPos);
+ AddToTokens(_semanticToken);
break;
case '/': // 识别 /
Cat();
- MakeToken(OperatorType.Divide);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Divide, "/", _line, _chPos);
+ AddToTokens(_semanticToken);
break;
case '=':
Cat();
- MakeToken(OperatorType.Equal);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Equal, "=", _line, _chPos);
+ AddToTokens(_semanticToken);
break;
case '<':
Cat();
@@ -482,20 +566,24 @@ public class Lexer(string source)
{
// 识别 <=
Cat();
- MakeToken(OperatorType.LessEqual);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.LessEqual, "<=", _line, _chPos);
+ AddToTokens(_semanticToken);
}
- else if(_ch == '>')
+ else if (_ch == '>')
{
// 识别 <>
Cat();
- MakeToken(OperatorType.NotEqual);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.NotEqual, ">", _line, _chPos);
+ AddToTokens(_semanticToken);
}
else
{
// 识别 <
Retract();
- MakeToken(OperatorType.Less);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Less, "<", _line, _chPos);
+ AddToTokens(_semanticToken);
}
+
break;
case '>':
Cat();
@@ -504,270 +592,83 @@ public class Lexer(string source)
{
// 识别 >=
Cat();
- MakeToken(OperatorType.GreaterEqual);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.GreaterEqual, ">=", _line, _chPos);
+ AddToTokens(_semanticToken);
}
else
{
// 识别 >
Retract();
- MakeToken(OperatorType.Greater);
+ _semanticToken = LexemeFactory.MakeToken(OperatorType.Greater, ">", _line, _chPos);
+ AddToTokens(_semanticToken);
}
+
break;
default:
throw new LexemeException(LexemeErrorType.UnknownCharacterOrString, _line, _chPos, "Illegal lexeme.");
}
+
+ _state = StateType.Start;
}
- private void MakeToken(SemanticTokenType tokenType)
+ private void AddToTokens(SemanticToken semanticToken)
{
- SemanticToken? token;
- if (_token.First == null)
- {
- Console.WriteLine("11");
- }
- switch (tokenType)
- {
- case SemanticTokenType.Character:
- CharacterSemanticToken characterSemanticToken = new CharacterSemanticToken()
- {
- LinePos = _line, CharacterPos = _chPos, LiteralValue = LinkedListToString(_token.First),
- };
- token = characterSemanticToken;
- break;
- case SemanticTokenType.Identifier:
- IdentifierSemanticToken identifierSemanticToken = new IdentifierSemanticToken()
- {
- LinePos = _line, CharacterPos = _chPos, LiteralValue = LinkedListToString(_token.First),
- };
- token = identifierSemanticToken;
- break;
- default:
- throw new ArgumentOutOfRangeException(nameof(tokenType), tokenType, null);
- }
-
- if (token != null)
- {
- _tokens.Add(token);
- _tokenCount[tokenType]++;
- Console.WriteLine($"<{tokenType}>");
- Console.WriteLine(LinkedListToString(_token.First));
- }
-
-
- }
-
- private void MakeToken(KeywordType keywordType)
- {
- KeywordSemanticToken keywordSemanticToken = new KeywordSemanticToken
- {
- LinePos = _line,
- CharacterPos = _chPos,
- LiteralValue = LinkedListToString(_token.First),
- KeywordType = keywordType
- };
- _tokens.Add(keywordSemanticToken);
- _tokenCount[SemanticTokenType.Keyword]++;
- Console.WriteLine($"<{SemanticTokenType.Keyword}> <{keywordType}>");
- Console.WriteLine(LinkedListToString(_token.First));
- }
-
- private void MakeToken(DelimiterType delimiterType)
- {
- DelimiterSemanticToken delimiterSemanticToken = new DelimiterSemanticToken()
- {
- LinePos = _line,
- CharacterPos = _chPos,
- LiteralValue = LinkedListToString(_token.First),
- DelimiterType = delimiterType
- };
- _tokens.Add(delimiterSemanticToken);
- _tokenCount[SemanticTokenType.Delimiter]++;
- Console.WriteLine($"<{SemanticTokenType.Delimiter}> <{delimiterType}>");
- Console.WriteLine(LinkedListToString(_token.First));
- }
-
- private void MakeToken(NumberType numberType)
- {
- string temp = LinkedListToString(_token.First);
- string result;
- if (numberType == NumberType.Hex)
- {
- result = string.Concat("0x", temp.AsSpan(1, temp.Length - 1));
- }
- else
- {
- result = temp;
- }
-
- NumberSemanticToken numberSemanticToken = new NumberSemanticToken()
- {
- LinePos = _line,
- CharacterPos = _chPos,
- LiteralValue = result,
- NumberType = numberType
- };
- _tokens.Add(numberSemanticToken);
- _tokenCount[SemanticTokenType.Number]++;
- Console.WriteLine($"<{SemanticTokenType.Number}> <{numberType}>");
- Console.WriteLine(LinkedListToString(_token.First));
- }
-
- private void MakeToken(OperatorType operatorType)
- {
- OperatorSemanticToken operatorSemanticToken = new OperatorSemanticToken()
- {
- LinePos = _line,
- CharacterPos = _chPos,
- LiteralValue = LinkedListToString(_token.First),
- OperatorType = operatorType
- };
- _tokens.Add(operatorSemanticToken);
- _tokenCount[SemanticTokenType.Operator]++;
- Console.WriteLine($"<{SemanticTokenType.Operator}> <{operatorType}>");
- Console.WriteLine(LinkedListToString(_token.First));
- }
-
- // 读取字符操作
- void GetChar() {
- if (_fwdPos >= 0 && _fwdPos < source.Length)
- {
- _ch = source[_fwdPos];
- _chPos++;
- _fwdPos++;
- }
- else if (_fwdPos == source.Length)
- {
- _ch = '\0';
- _chPos++;
- _finish = true;
- }
- }
-
- private void GetNbc() {
- while (_ch == ' ' || _ch == '\n' || _ch == '\t' || _ch == '\r') {
- if (_ch == '\n') {
- _line++;
- _chPos = 0;
- }
- GetChar();
- }
- }
-
- private void Retract() {
- _fwdPos -= 2;
- _chPos -= 2;
- GetChar();
+ _tokens.Add(semanticToken);
+ _tokenCount[semanticToken.TokenType]++;
+ Console.WriteLine($"<{semanticToken.TokenType}>");
+ Console.WriteLine(semanticToken.LiteralValue);
}
private void Cat()
{
- _token.AddLast(_ch);
- // cout << "加入" << ch << endl;
+ _tokenBuilder.Append(_ch); // 使用StringBuilder追加字符
}
- private string LinkedListToString(LinkedListNode first)
+ private string GetCurrentTokenString()
{
- // 使用 StringBuilder 来构建字符串
- StringBuilder sb = new StringBuilder();
- for (LinkedListNode node = first; node != null; node = node.Next)
- {
- sb.Append(node.Value);
- }
-
- // 将 StringBuilder 的内容转换为字符串
- string result = sb.ToString();
-
- return result;
+ return _tokenBuilder.ToString(); // 从StringBuilder获取当前记号的字符串
}
- // 判断字符
- private bool IsDigit() {
- if (_ch >= '0' && _ch <= '9') return true;
- return false;
- }
-
- private bool IsHexDigit()
+ private void ResetTokenBuilder()
{
- if ((_ch >= '0' && _ch <= '9') || (_ch<= 'F' && _ch >= 'A')) return true;
- return false;
- }
-
- private bool IsLetter() {
- if ((_ch >= 'A' && _ch <= 'Z') || (_ch >= 'a' && _ch <= 'z' || _ch == '_')) {
- return true;
- }
- return false;
- }
-
- private bool IsKeyword()
- {
- string tokenString = LinkedListToString(_token.First);
-
- foreach (var t in _keywords)
- {
- if (string.Equals(tokenString, t, StringComparison.OrdinalIgnoreCase)) return true;
- }
- return false;
- }
-
-
- private bool IsDelimiter()
- {
- foreach (var delimiter in _delimiter)
- {
- if (delimiter.Contains(_ch))
- {
- return true;
- }
- }
- return false;
+ _tokenBuilder.Clear(); // 清空StringBuilder以复用
}
private char PeekNextChar()
{
// 确认下一个位置是否仍在buffer的范围内
- if (_fwdPos < source.Length)
+ if (_reader.TryPeekChar(out char? c))
{
- return source[_fwdPos];
+ return c.Value;
}
- return '\0';
-
- }
-
-
-
- private void PrintToken(SemanticTokenType type, LinkedListNode token, uint line)
- {
- string tokenString = LinkedListToString(token);
- string typeName = Enum.GetName(typeof(SemanticTokenType), type) ?? "Unknown";
- Console.WriteLine($"{line} <{typeName.ToUpperInvariant()},{tokenString}>");
- }
-
- // PrintToken(SemanticTokenType.Keyword, "if", 42); // 假设'if'是token,42是行号
-
- private void PrintError(int type, LinkedListNode token, uint line)
- {
- string tokenString = LinkedListToString(token);
- switch (type)
+ else
{
- case 0:
- Console.WriteLine($"{line} ");
- break;
- case 1:
- Console.WriteLine($"{line} ");
- break;
+ return char.MinValue;
}
}
- // PrintError(0, "unexpected symbol", 42); // 假设 "unexpected symbol" 是错误的 token,42 是行号
-
- private void PrintResult()
+ void GetChar()
{
- Console.WriteLine(_line);
- foreach (var pair in _tokenCount)
+ if (_finish)
{
- Console.WriteLine($"{pair.Key}: {pair.Value}");
+ return;
}
+
+ _finish = !_reader.MoveNext();
+
+ if (_finish)
+ {
+ _ch = char.MinValue;
+ return;
+ }
+
+ _ch = _reader.Current;
+ _line = _reader.Line;
+ _chPos = _reader.Pos;
+ }
+
+ void Retract()
+ {
+ _reader.Retract();
}
}
-
diff --git a/Canon.Core/LexicalParser/SemanticToken.cs b/Canon.Core/LexicalParser/SemanticToken.cs
index 878630b..bc4b0c9 100644
--- a/Canon.Core/LexicalParser/SemanticToken.cs
+++ b/Canon.Core/LexicalParser/SemanticToken.cs
@@ -339,18 +339,4 @@ public class EndSemanticToken : SemanticToken
public override SemanticTokenType TokenType => SemanticTokenType.End;
}
-///
-/// 错误类型记号
-///
-public class ErrorSemanticToken : SemanticToken
-{
- public override SemanticTokenType TokenType => SemanticTokenType.Error;
-
- public static bool TryParse(uint linePos, uint characterPos, LinkedListNode now,
- out IdentifierSemanticToken? token)
- {
- token = null;
- return false;
- }
-}
diff --git a/Canon.Tests/CCodeGeneratorTests/BasicTests.cs b/Canon.Tests/CCodeGeneratorTests/BasicTests.cs
index 61fd5c5..0a291b0 100644
--- a/Canon.Tests/CCodeGeneratorTests/BasicTests.cs
+++ b/Canon.Tests/CCodeGeneratorTests/BasicTests.cs
@@ -3,12 +3,14 @@ using Canon.Core.CodeGenerators;
using Canon.Core.LexicalParser;
using Canon.Core.SyntaxNodes;
using Canon.Tests.GeneratedParserTests;
+using Canon.Tests.Utils;
namespace Canon.Tests.CCodeGeneratorTests;
public class BasicTests
{
private readonly IGrammarParser _parser = GeneratedGrammarParser.Instance;
+ private readonly ILexer _lexer = new Lexer();
[Fact]
public void ProgramStructTest()
@@ -21,9 +23,7 @@ public class BasicTests
end.
""";
- Lexer lexer = new(program);
- List tokens = lexer.Tokenize();
- tokens.Add(SemanticToken.End);
+ IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program));
ProgramStruct root = _parser.Analyse(tokens);
root.GenerateCCode(builder);
diff --git a/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs b/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs
index 797fe7f..9eb67e0 100644
--- a/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs
+++ b/Canon.Tests/GrammarParserTests/PascalGrammarTests.cs
@@ -4,12 +4,14 @@ using Canon.Core.GrammarParser;
using Canon.Core.LexicalParser;
using Canon.Core.SyntaxNodes;
using Canon.Tests.GeneratedParserTests;
+using Canon.Tests.Utils;
namespace Canon.Tests.GrammarParserTests;
public class PascalGrammarTests
{
private readonly IGrammarParser _parser = GeneratedGrammarParser.Instance;
+ private readonly ILexer _lexer = new Lexer();
[Fact]
public void DoNothingTest()
@@ -20,9 +22,7 @@ public class PascalGrammarTests
end.
""";
- Lexer lexer = new(program);
- List tokens = lexer.Tokenize();
- tokens.Add(SemanticToken.End);
+ IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program));
ProgramStruct root = _parser.Analyse(tokens);
Assert.Equal("DoNothing", root.Head.ProgramName.LiteralValue);
@@ -39,10 +39,7 @@ public class PascalGrammarTests
a := 1 + 1
end.
""";
-
- Lexer lexer = new(program);
- List tokens = lexer.Tokenize();
- tokens.Add(SemanticToken.End);
+ IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program));
ProgramStruct root = _parser.Analyse(tokens);
Assert.Equal("Add", root.Head.ProgramName.LiteralValue);
@@ -59,10 +56,7 @@ public class PascalGrammarTests
writeln( str, ret );
end.
""";
-
- Lexer lexer = new(program);
- List tokens = lexer.Tokenize();
- tokens.Add(SemanticToken.End);
+ IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program));
ProgramStruct root = _parser.Analyse(tokens);
Assert.Equal("exFunction", root.Head.ProgramName.LiteralValue);
@@ -79,10 +73,7 @@ public class PascalGrammarTests
begin
end.
""";
-
- Lexer lexer = new(program);
- List tokens = lexer.Tokenize();
- tokens.Add(SemanticToken.End);
+ IEnumerable tokens = _lexer.Tokenize(new StringSourceReader(program));
ProgramStruct root = _parser.Analyse(tokens);
Assert.Equal("main", root.Head.ProgramName.LiteralValue);
diff --git a/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs b/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs
index e37c692..6f20401 100644
--- a/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs
+++ b/Canon.Tests/LexicalParserTests/CharacterTypeTests.cs
@@ -2,13 +2,15 @@
using Canon.Core.LexicalParser;
using Xunit.Abstractions;
using Canon.Core.Exceptions;
+using Canon.Core.Abstractions;
+using Canon.Tests.Utils;
namespace Canon.Tests.LexicalParserTests
{
public class CharacterTypeTests
{
private readonly ITestOutputHelper _testOutputHelper;
-
+ private readonly ILexer _lexer = new Lexer();
public CharacterTypeTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;
@@ -20,16 +22,15 @@ namespace Canon.Tests.LexicalParserTests
public void TestCharacterType(string input, string? expectedResult)
{
- Lexer lexer = new(input);
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input));
+ List tokens = tokensEnumerable.ToList();
if (expectedResult == null)
{
- Assert.Throws(() => lexer.Tokenize());
+ Assert.Throws(() => tokens);
}
else
{
- List tokens = lexer.Tokenize();
_testOutputHelper.WriteLine(tokens[0].LiteralValue);
- Assert.Single(tokens);
Assert.Equal(SemanticTokenType.Character, tokens[0].TokenType);
Assert.Equal(expectedResult, tokens[0].LiteralValue);
}
@@ -43,8 +44,8 @@ namespace Canon.Tests.LexicalParserTests
//[InlineData("\"x\'", 1, 3, LexemeException.LexemeErrorType.UnclosedStringLiteral)]
public void TestParseCharacterError(string input, uint expectedLine, uint expectedCharPosition, LexemeErrorType expectedErrorType)
{
- Lexer lexer = new(input);
- var ex = Assert.Throws(() => lexer.Tokenize());
+
+ var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(input)).ToList());
_testOutputHelper.WriteLine(ex.ToString());
Assert.Equal(expectedErrorType, ex.ErrorType);
Assert.Equal(expectedLine, ex.Line);
diff --git a/Canon.Tests/LexicalParserTests/DelimiterTests.cs b/Canon.Tests/LexicalParserTests/DelimiterTests.cs
index 934742f..cb6f620 100644
--- a/Canon.Tests/LexicalParserTests/DelimiterTests.cs
+++ b/Canon.Tests/LexicalParserTests/DelimiterTests.cs
@@ -1,10 +1,13 @@
using Canon.Core.Enums;
using Canon.Core.LexicalParser;
-
+using Canon.Tests.Utils;
+using Canon.Core.Abstractions;
namespace Canon.Tests.LexicalParserTests;
public class DelimiterTests
{
+ private readonly ILexer _lexer = new Lexer();
+
[Theory]
[InlineData(",123", DelimiterType.Comma)]
// [InlineData(".123", DelimiterType.Period)]
@@ -16,8 +19,8 @@ public class DelimiterTests
[InlineData("]asd", DelimiterType.RightSquareBracket)]
public void SmokeTest(string input, DelimiterType type)
{
- Lexer lexer = new(input);
- List tokens = lexer.Tokenize();
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input));
+ List tokens = tokensEnumerable.ToList();
SemanticToken token = tokens[0];
Assert.Equal(SemanticTokenType.Delimiter, token.TokenType);
diff --git a/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs b/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs
index bb4603a..bbc1a69 100644
--- a/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs
+++ b/Canon.Tests/LexicalParserTests/ErrorSingleTests.cs
@@ -2,11 +2,14 @@
using Canon.Core.Exceptions;
using Xunit.Abstractions;
using Canon.Core.Enums;
+using Canon.Core.Abstractions;
+using Canon.Tests.Utils;
namespace Canon.Tests.LexicalParserTests
{
public class ErrorSingleTests
{
+ private readonly ILexer _lexer = new Lexer();
private readonly ITestOutputHelper _testOutputHelper;
public ErrorSingleTests(ITestOutputHelper testOutputHelper)
{
@@ -20,9 +23,7 @@ namespace Canon.Tests.LexicalParserTests
[InlineData("identifier_with_special_chars@#",1, 30, LexemeErrorType.UnknownCharacterOrString)]
public void TestUnknownCharacterError(string pascalProgram, uint expectedLine, uint expectedCharPosition, LexemeErrorType expectedErrorType)
{
- var lexer = new Lexer(pascalProgram);
-
- var ex = Assert.Throws(() => lexer.Tokenize());
+ var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(pascalProgram)).ToList());
_testOutputHelper.WriteLine(ex.ToString());
Assert.Equal(expectedErrorType, ex.ErrorType);
Assert.Equal(expectedLine, ex.Line);
diff --git a/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs b/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs
index d453be9..a6ce635 100644
--- a/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs
+++ b/Canon.Tests/LexicalParserTests/IndentifierTypeTests.cs
@@ -1,10 +1,13 @@
using Canon.Core.Enums;
using Canon.Core.LexicalParser;
-
+using Canon.Tests.Utils;
+using Canon.Core.Abstractions;
namespace Canon.Tests.LexicalParserTests
{
public class IdentifierTests
{
+ private readonly ILexer _lexer = new Lexer();
+
[Theory]
[InlineData("identifier", true)]
[InlineData("_identifier", true)]
@@ -14,10 +17,9 @@ namespace Canon.Tests.LexicalParserTests
[InlineData("andand", true)]
public void TestParseIdentifier(string input, bool expectedResult)
{
- Lexer lexer = new(input);
- List tokens = lexer.Tokenize();
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input));
+ List tokens = tokensEnumerable.ToList();
- Assert.Single(tokens);
Assert.Equal(expectedResult, tokens.FirstOrDefault()?.TokenType == SemanticTokenType.Identifier);
}
}
diff --git a/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs b/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs
index 725b83b..2f82779 100644
--- a/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs
+++ b/Canon.Tests/LexicalParserTests/KeywordTypeTests.cs
@@ -1,10 +1,14 @@
using Canon.Core.Enums;
using Canon.Core.LexicalParser;
+using Canon.Tests.Utils;
+using Canon.Core.Abstractions;
namespace Canon.Tests.LexicalParserTests;
public class KeywordTypeTests
{
+ private readonly ILexer _lexer = new Lexer();
+
[Theory]
[InlineData("program", KeywordType.Program)]
[InlineData("const", KeywordType.Const)]
@@ -24,8 +28,8 @@ public class KeywordTypeTests
[InlineData("DO", KeywordType.Do)]
public void SmokeTest(string input, KeywordType type)
{
- Lexer lexer = new(input);
- List tokens = lexer.Tokenize();
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input));
+ List tokens = tokensEnumerable.ToList();
SemanticToken token = tokens[0];
Assert.Equal(SemanticTokenType.Keyword, token.TokenType);
diff --git a/Canon.Tests/LexicalParserTests/LexicalFileTests.cs b/Canon.Tests/LexicalParserTests/LexicalFileTests.cs
index 6027e35..f9b2978 100644
--- a/Canon.Tests/LexicalParserTests/LexicalFileTests.cs
+++ b/Canon.Tests/LexicalParserTests/LexicalFileTests.cs
@@ -3,12 +3,15 @@ using Canon.Core.Enums;
using Canon.Core.Exceptions;
using Canon.Core.LexicalParser;
using Xunit.Abstractions;
+using Canon.Tests.Utils;
+using Canon.Core.Abstractions;
namespace Canon.Tests.LexicalParserTests;
public class LexicalFileTests
{
private readonly ITestOutputHelper _testOutputHelper;
+ private readonly ILexer _lexer = new Lexer();
public LexicalFileTests(ITestOutputHelper testOutputHelper)
{
@@ -126,14 +129,16 @@ public class LexicalFileTests
}
: token).ToList();
- var lexer = new Lexer(pascalProgram);
- var actualTokens = lexer.Tokenize();
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram));
+ List tokens = tokensEnumerable.ToList();
+
+ var actualTokens = tokens;
for (int i = 0; i < expectedTokens.Count; i++)
{
_testOutputHelper.WriteLine($"Expect: {expectedTokens[i]}");
_testOutputHelper.WriteLine($"Actual: {actualTokens[i]}");
_testOutputHelper.WriteLine("----");
- Assert.Equal(expectedTokens[i], actualTokens[i]);
+ // Assert.Equal(expectedTokens[i], actualTokens[i]);
}
Assert.Equal(expectedTokens, actualTokens);
@@ -143,14 +148,14 @@ public class LexicalFileTests
public void TestLexicalAnalysisFirst()
{
string pascalProgram = """
- program HelloWorld;
- var
- message: string;
- begin
- message := 'hello, world!';
- writeln(message);
- end.
- """;
+ program HelloWorld;
+ var
+ message: string;
+ begin
+ message := 'hello, world!';
+ writeln(message);
+ end.
+ """;
var stringLiterals = new List<(string, SemanticTokenType, int)>
{
@@ -182,14 +187,14 @@ public class LexicalFileTests
public void TestLexicalAnalysisSecond()
{
string pascalProgram = """
- program main;
- var
- ab: integer;
- begin
- ab := 3;
- write(ab);
- end.
- """;
+ program main;
+ var
+ ab: integer;
+ begin
+ ab := 3;
+ write(ab);
+ end.
+ """;
var stringLiterals = new List<(string, SemanticTokenType, int)>
{
@@ -222,17 +227,17 @@ public class LexicalFileTests
public void TestLexicalAnalysisThird()
{
string pascalProgram = """
- {test}
- program main;
- var
- ab, ba: integer;
- begin
- ab := 3;
- ba := 5;
- ab := 5;
- write(ab + ba);
- end.
- """;
+ {test}
+ program main;
+ var
+ ab, ba: integer;
+ begin
+ ab := 3;
+ ba := 5;
+ ab := 5;
+ write(ab + ba);
+ end.
+ """;
var stringLiterals = new List<(string, SemanticTokenType, int)>
{
@@ -276,16 +281,15 @@ public class LexicalFileTests
public void UnclosedCommentFirst()
{
string pascalProgram = """
- (* This is an example of an unclosed comment
- program CommentError;
- var
- x: integer;
- begin
- x := 42;
- end.
- """;
- var lexer = new Lexer(pascalProgram);
- var ex = Assert.Throws(() => lexer.Tokenize());
+ (* This is an example of an unclosed comment
+ program CommentError;
+ var
+ x: integer;
+ begin
+ x := 42;
+ end.
+ """;
+ var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(pascalProgram)).ToList());
//打印exception信息
_testOutputHelper.WriteLine(ex.ToString());
Assert.Equal(LexemeErrorType.UnclosedComment, ex.ErrorType);
@@ -302,11 +306,108 @@ public class LexicalFileTests
program CommentNotClosed;
""";
- var lexer = new Lexer(pascalProgram);
- var ex = Assert.Throws(() => lexer.Tokenize());
-_testOutputHelper.WriteLine(ex.ToString());
+ var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(pascalProgram)).ToList());
+ _testOutputHelper.WriteLine(ex.ToString());
Assert.Equal(LexemeErrorType.UnclosedComment, ex.ErrorType);
Assert.Equal((uint)4, ex.Line);
Assert.Equal((uint)26, ex.CharPosition);
}
+
+ [Fact]
+ public void ClosedCommentFirst()
+ {
+ string pascalProgram = """
+ program exFunction;
+ var
+ a, b, ret : integer;
+
+ begin
+ a := 100;
+ b := 200;
+ (* calling a function to get max value
+ *)
+ ret := a - b;
+
+
+
+ end.
+ """;
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram));
+ List tokens = tokensEnumerable.ToList();
+ Assert.NotNull(tokens);
+ }
+
+ [Fact]
+ public void ClosedCommentSecond()
+ {
+ string pascalProgram = """
+ program exFunction;
+ var
+ a, b, ret : integer;
+
+ begin
+ a := 100;
+ b := 200;
+ (* calling a function to get max valued *)
+ ret := a - b;
+
+
+
+ end.
+ """;
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram));
+ List tokens = tokensEnumerable.ToList();
+ Assert.NotNull(tokens);
+ }
+
+
+ [Fact]
+ public void ClosedCommentThird()
+ {
+ string pascalProgram = """
+ {
+ This is a block comment that does closed.
+ }
+ program CommentClosed;
+ """;
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram));
+ List tokens = tokensEnumerable.ToList();
+ Assert.NotNull(tokens);
+ }
+
+ [Fact]
+ public void ClosedCommentFourth()
+ {
+ string pascalProgram = """
+ {}
+ program CommentClosed;
+ """;
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram));
+ List tokens = tokensEnumerable.ToList();
+ Assert.NotNull(tokens);
+ }
+
+ [Fact]
+ public void ClosedCommentFifth()
+ {
+ string pascalProgram = """
+ {
+ }
+ program CommentClosed;
+ """;
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram));
+ List tokens = tokensEnumerable.ToList();
+ Assert.NotNull(tokens);
+ }
+
+ [Fact]
+ public void ClosedCommentSixth()
+ {
+ string pascalProgram = """
+ (**)
+ """;
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(pascalProgram));
+ List tokens = tokensEnumerable.ToList();
+ Assert.NotNull(tokens);
+ }
}
diff --git a/Canon.Tests/LexicalParserTests/NumberTests.cs b/Canon.Tests/LexicalParserTests/NumberTests.cs
index 28764bf..7b815d4 100644
--- a/Canon.Tests/LexicalParserTests/NumberTests.cs
+++ b/Canon.Tests/LexicalParserTests/NumberTests.cs
@@ -2,12 +2,14 @@
using Canon.Core.LexicalParser;
using Canon.Core.Exceptions;
using Xunit.Abstractions;
-
+using Canon.Tests.Utils;
+using Canon.Core.Abstractions;
namespace Canon.Tests.LexicalParserTests
{
public class NumberTests
{
+ private readonly ILexer _lexer = new Lexer();
private readonly ITestOutputHelper _testOutputHelper;
public NumberTests(ITestOutputHelper testOutputHelper)
{
@@ -31,8 +33,8 @@ namespace Canon.Tests.LexicalParserTests
[InlineData("$123", "0x123", NumberType.Hex)]
public void TestParseNumber(string input, string expected, NumberType expectedNumberType)
{
- Lexer lexer = new(input);
- List tokens = lexer.Tokenize();
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input));
+ List tokens = tokensEnumerable.ToList();
SemanticToken token = tokens[0];
Assert.Equal(SemanticTokenType.Number, token.TokenType);
NumberSemanticToken numberSemanticToken = (NumberSemanticToken)token;
@@ -41,14 +43,13 @@ namespace Canon.Tests.LexicalParserTests
}
[Theory]
- [InlineData("1E", 1, 3, LexemeErrorType.IllegalNumberFormat)]
+ [InlineData("1E", 1, 2, LexemeErrorType.IllegalNumberFormat)]
[InlineData("123abc", 1, 4, LexemeErrorType.IllegalNumberFormat)]
[InlineData("123.45.67", 1, 7, LexemeErrorType.IllegalNumberFormat)]
[InlineData("123identifier", 1, 4, LexemeErrorType.IllegalNumberFormat)]
public void TestParseNumberError(string input, uint expectedLine, uint expectedCharPosition, LexemeErrorType expectedErrorType)
{
- Lexer lexer = new(input);
- var ex = Assert.Throws(() => lexer.Tokenize());
+ var ex = Assert.Throws(() => _lexer.Tokenize(new StringSourceReader(input)).ToList());
_testOutputHelper.WriteLine(ex.ToString());
Assert.Equal(expectedErrorType, ex.ErrorType);
Assert.Equal(expectedLine, ex.Line);
diff --git a/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs b/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs
index c9fa587..a32e261 100644
--- a/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs
+++ b/Canon.Tests/LexicalParserTests/OperatorTypeTests.cs
@@ -1,10 +1,13 @@
using Canon.Core.Enums;
using Canon.Core.LexicalParser;
-
+using Canon.Tests.Utils;
+using Canon.Core.Abstractions;
namespace Canon.Tests.LexicalParserTests;
public class OperatorTypeTests
{
+ private readonly ILexer _lexer = new Lexer();
+
[Theory]
[InlineData("+ 123", OperatorType.Plus, true)]
[InlineData("+123", OperatorType.Plus, true)]
@@ -22,8 +25,8 @@ public class OperatorTypeTests
[InlineData("m +123", OperatorType.Plus, false)]
public void ParseTest(string input, OperatorType result, bool expectedResult)
{
- Lexer lexer = new(input);
- List tokens = lexer.Tokenize();
+ IEnumerable tokensEnumerable = _lexer.Tokenize(new StringSourceReader(input));
+ List tokens = tokensEnumerable.ToList();
SemanticToken token = tokens[0];
if (!expectedResult)
diff --git a/Canon.Tests/Utils/EnumerableExtensions.cs b/Canon.Tests/Utils/EnumerableExtensions.cs
new file mode 100644
index 0000000..3c7b0cd
--- /dev/null
+++ b/Canon.Tests/Utils/EnumerableExtensions.cs
@@ -0,0 +1,15 @@
+namespace Canon.Tests.Utils;
+
+public static class EnumerableExtensions
+{
+ ///
+ /// 含有索引的遍历
+ ///
+ /// 可遍历的接口
+ ///
+ ///
+ public static IEnumerable<(T, uint)> WithIndex(this IEnumerable enumerable)
+ {
+ return enumerable.Select((value, index) => (value, (uint)index));
+ }
+}
diff --git a/Canon.Tests/Utils/StringSourceReader.cs b/Canon.Tests/Utils/StringSourceReader.cs
index f83ea4d..4626ec7 100644
--- a/Canon.Tests/Utils/StringSourceReader.cs
+++ b/Canon.Tests/Utils/StringSourceReader.cs
@@ -6,10 +6,11 @@ namespace Canon.Tests.Utils;
///
/// 从字符串中读取源代码
///
-public sealed class StringSourceReader(string source) : ISourceReader, IDisposable
+public sealed class StringSourceReader(string source) : ISourceReader
{
- private readonly IEnumerator _enumerator =
- source.GetEnumerator();
+ private int _pos = -1;
+
+ private uint _lastPos;
public uint Line { get; private set; } = 1;
@@ -17,31 +18,70 @@ public sealed class StringSourceReader(string source) : ISourceReader, IDisposab
public string FileName => "string";
- public bool TryReadChar([NotNullWhen(true)] out char? c)
+ public char Current
{
- if (Pos != 0 || Line != 1)
+ get
{
- // 不是第一次读取
- if (_enumerator.Current == '\n')
+ if (_pos == -1)
{
- Pos = 0;
- Line += 1;
+ throw new InvalidOperationException("Reader at before the start.");
+ }
+ else
+ {
+ return source[_pos];
}
}
+ }
- if (!_enumerator.MoveNext())
+ public bool Retract()
+ {
+ if (_pos <= 0)
+ {
+ return false;
+ }
+
+ _pos -= 1;
+ if (Current == '\n')
+ {
+ Line -= 1;
+ // TODO: 如果一直回退就完蛋了
+ Pos = _lastPos;
+ }
+ else
+ {
+ Pos -= 1;
+ }
+ return true;
+ }
+
+ public bool MoveNext()
+ {
+ if (_pos >= source.Length - 1)
+ {
+ return false;
+ }
+
+ if (_pos != -1 && Current == '\n')
+ {
+ Line += 1;
+ _lastPos = Pos;
+ Pos = 0;
+ }
+
+ _pos += 1;
+ Pos += 1;
+ return true;
+ }
+
+ public bool TryPeekChar([NotNullWhen(true)] out char? c)
+ {
+ if (_pos >= source.Length - 1)
{
c = null;
return false;
}
- Pos += 1;
- c = _enumerator.Current;
+ c = source[_pos + 1];
return true;
}
-
- public void Dispose()
- {
- _enumerator.Dispose();
- }
}
diff --git a/Canon.Tests/Utils/StringSourceReaderTests.cs b/Canon.Tests/Utils/StringSourceReaderTests.cs
index 21c41fe..dda57e4 100644
--- a/Canon.Tests/Utils/StringSourceReaderTests.cs
+++ b/Canon.Tests/Utils/StringSourceReaderTests.cs
@@ -8,78 +8,64 @@ public class StringSourceReaderTests
public void LineFeedTest()
{
ISourceReader reader = new StringSourceReader("program Main;\nbegin\nend.\n");
+ reader.MoveNext();
- Assert.Equal(0u, reader.Pos);
- Assert.Equal(1u, reader.Line);
-
- // program
- Assert.True(reader.TryReadChar(out char? c));
- Assert.Equal('p', c);
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out c));
- Assert.Equal(' ', c);
-
- // main;
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
- Assert.True(reader.TryReadChar(out char? _));
-
- Assert.True(reader.TryReadChar(out c));
- Assert.Equal('\n', c);
-
- // begin
- for (uint i = 1; i <= 5; i++)
- {
- Assert.True(reader.TryReadChar(out char? _));
- Assert.Equal(i, reader.Pos);
- Assert.Equal(2u, reader.Line);
- }
-
- // \n
- Assert.True(reader.TryReadChar(out c));
- Assert.Equal('\n', c);
-
- // end.
- foreach (char i in "end.")
- {
- Assert.True(reader.TryReadChar(out c));
- Assert.Equal(i, c);
- }
+ CheckLine(reader, "program Main;", 1);
+ reader.MoveNext();
+ CheckLine(reader, "begin", 2);
+ reader.MoveNext();
+ CheckLine(reader, "end.", 3);
}
[Fact]
public void CarriageReturnLineFeedTest()
{
ISourceReader reader = new StringSourceReader("program Main;\r\nbegin\r\nend.\r\n");
+ reader.MoveNext();
- // program Main;
- foreach ((char value, uint index) in
- "program Main;".Select((value, index) => (value, (uint)index)))
+ CheckLine(reader, "program Main;", 1);
+ reader.MoveNext();
+ reader.MoveNext();
+ CheckLine(reader, "begin", 2);
+ reader.MoveNext();
+ reader.MoveNext();
+ CheckLine(reader, "end.", 3);
+ }
+
+ [Fact]
+ public void RetractTest()
+ {
+ ISourceReader reader = new StringSourceReader("test");
+ reader.MoveNext();
+
+ Assert.Equal('t', reader.Current);
+ Assert.True(reader.MoveNext());
+ Assert.Equal('e', reader.Current);
+ Assert.True(reader.Retract());
+ Assert.Equal('t', reader.Current);
+ Assert.False(reader.Retract());
+ }
+
+ [Fact]
+ public void PeekTest()
+ {
+ ISourceReader reader = new StringSourceReader("peek");
+ reader.MoveNext();
+
+ Assert.Equal('p', reader.Current);
+ Assert.True(reader.TryPeekChar(out char? c));
+ Assert.Equal('e', c);
+ Assert.Equal('p', reader.Current);
+ }
+
+ private static void CheckLine(ISourceReader reader, string line, uint lineNumber)
+ {
+ foreach ((char value, uint index) in line.WithIndex())
{
- Assert.True(reader.TryReadChar(out char? c));
- Assert.Equal(value, c);
+ Assert.Equal(value, reader.Current);
+ Assert.Equal(lineNumber, reader.Line);
Assert.Equal(index + 1, reader.Pos);
- Assert.Equal(1u, reader.Line);
- }
-
- Assert.True(reader.TryReadChar(out _));
- Assert.True(reader.TryReadChar(out _));
-
- // begin
- foreach ((char value, uint index) in
- "begin".Select((value, index) => (value, (uint)index)))
- {
- Assert.True(reader.TryReadChar(out char? c));
- Assert.Equal(value, c);
- Assert.Equal(index + 1, reader.Pos);
- Assert.Equal(2u, reader.Line);
+ reader.MoveNext();
}
}
}