diff --git a/CanonSharp.Common/Abstractions/ISourceReader.cs b/CanonSharp.Common/Abstractions/ISourceReader.cs
new file mode 100644
index 0000000..c53fa2f
--- /dev/null
+++ b/CanonSharp.Common/Abstractions/ISourceReader.cs
@@ -0,0 +1,17 @@
+namespace CanonSharp.Common.Abstractions;
+
+// Character-level input abstraction: callers can look at the next character
+// with TryPeek and consume it with Read.
+public interface ISourceReader
+{
+ /// <summary>
+ /// Peeks at the next character without consuming it.
+ /// </summary>
+ /// <param name="c">Receives the next character when one is available.</param>
+ /// <returns><c>true</c> if a character is available; otherwise <c>false</c>.</returns>
+ public bool TryPeek(out char c);
+
+ /// <summary>
+ /// Reads and consumes the next character.
+ /// </summary>
+ /// <returns>The character that was read.</returns>
+ public char Read();
+}
diff --git a/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs b/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs
index 6d52008..eaf844f 100644
--- a/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs
+++ b/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs
@@ -1,11 +1,13 @@
namespace CanonSharp.Common.LexicalAnalyzer;
-public class DeterministicState : IEquatable
+public class DeterministicState(HashSet closure) : IEquatable
{
public Guid Id { get; } = Guid.NewGuid();
public Dictionary Transaction { get; } = [];
+ public HashSet Closure { get; } = closure;
+
public bool Equals(DeterministicState? other) => other is not null && Id.Equals(other.Id);
public override bool Equals(object? obj) => obj is DeterministicState other && Equals(other);
@@ -35,7 +37,7 @@ public class DeterministicFiniteAutomation
HashSet finalStates = [];
HashSet startClosure = nfa.Start.CalculateEmptyClosure();
- DeterministicState start = new();
+ DeterministicState start = new(startClosure);
map.Add(new NondeterministicStateSet(startClosure), start);
queue.Enqueue(new Pair(startClosure, start));
@@ -50,7 +52,7 @@ public class DeterministicFiniteAutomation
foreach (NondeterministicState state in pair.States)
{
- foreach (KeyValuePair> transaction in
+ foreach (KeyValuePair> transaction in
state.Transactions.Where(p => !p.Key.IsEmpty))
{
HashSet closure = [];
@@ -64,16 +66,19 @@ public class DeterministicFiniteAutomation
{
n.UnionWith(closure);
}
- next.Add(transaction.Key.Char, closure);
+ else
+ {
+ next.Add(transaction.Key.Char, closure);
+ }
}
}
- foreach (KeyValuePair> transaction in next)
+ foreach (KeyValuePair> transaction in next)
{
NondeterministicStateSet set = new(transaction.Value);
if (!map.TryGetValue(set, out DeterministicState? nextState))
{
- nextState = new DeterministicState();
+ nextState = new DeterministicState(transaction.Value);
map.Add(set, nextState);
}
diff --git a/CanonSharp.Common/LexicalAnalyzer/LexicalScanner.cs b/CanonSharp.Common/LexicalAnalyzer/LexicalScanner.cs
new file mode 100644
index 0000000..caa8c2b
--- /dev/null
+++ b/CanonSharp.Common/LexicalAnalyzer/LexicalScanner.cs
@@ -0,0 +1,92 @@
+using System.Diagnostics.CodeAnalysis;
+using CanonSharp.Common.Abstractions;
+
+namespace CanonSharp.Common.LexicalAnalyzer;
+
+/// <summary>
+/// Scanner that walks a deterministic automaton over the characters supplied by an
+/// <see cref="ISourceReader"/> and produces <see cref="LexicalToken"/> instances.
+/// </summary>
+/// <param name="startState">State every new lexeme starts from.</param>
+/// <param name="finalStateMap">Maps each accepting state to the token definition it recognises.</param>
+/// <param name="skippedTokens">Token definitions that <see cref="TryRead"/> silently discards.</param>
+/// <param name="reader">Character source.</param>
+public class LexicalScanner(
+    DeterministicState startState,
+    Dictionary<DeterministicState, LexicalToken> finalStateMap,
+    HashSet<LexicalToken> skippedTokens,
+    ISourceReader reader)
+{
+    private readonly DeterministicState _startState = startState;
+
+    // Characters consumed for the lexeme currently being recognised.
+    private readonly List<char> _readHistory = [];
+
+    private DeterministicState _currentState = startState;
+
+    /// <summary>
+    /// Reads the next token that is not in the skipped set.
+    /// </summary>
+    /// <param name="token">The token that was read, or <c>null</c> when the input is exhausted.</param>
+    /// <returns><c>true</c> if a token was read; <c>false</c> at end of input.</returns>
+    /// <exception cref="InvalidOperationException">The input cannot be tokenised.</exception>
+    public bool TryRead([NotNullWhen(true)] out LexicalToken? token)
+    {
+        while (TryReadInternal(out token))
+        {
+            if (!skippedTokens.Contains(token))
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /// <summary>
+    /// Reads the next token, skipped or not, by following transitions for as long as
+    /// the next character allows.
+    /// </summary>
+    private bool TryReadInternal([NotNullWhen(true)] out LexicalToken? token)
+    {
+        while (reader.TryPeek(out char c))
+        {
+            if (_currentState.Transaction.TryGetValue(c, out DeterministicState? nextState))
+            {
+                // A transition exists: move on and consume the character.
+                _currentState = nextState;
+                _readHistory.Add(reader.Read());
+                continue;
+            }
+
+            // No transition for c, so the current lexeme ends just before it.
+            // If nothing has been consumed, no token can start with c. Emitting an
+            // empty token here would make no progress and loop forever.
+            if (_readHistory.Count == 0)
+            {
+                throw new InvalidOperationException(
+                    $"Unexpected character '{c}' (U+{(int)c:X4}): no token starts with it.");
+            }
+
+            token = AcceptCurrentLexeme();
+            return true;
+        }
+
+        // End of input with nothing pending: scanning is complete.
+        if (_readHistory.Count == 0)
+        {
+            token = null;
+            return false;
+        }
+
+        // End of input in the middle of a lexeme: it must be a complete token.
+        token = AcceptCurrentLexeme();
+        return true;
+    }
+
+    /// <summary>
+    /// Turns the consumed characters into a token and resets the scanner for the next lexeme.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The current state is not an accepting state.</exception>
+    private LexicalToken AcceptCurrentLexeme()
+    {
+        string literal = new(_readHistory.ToArray());
+
+        if (!finalStateMap.TryGetValue(_currentState, out LexicalToken? definition))
+        {
+            throw new InvalidOperationException($"The input \"{literal}\" does not form a complete token.");
+        }
+
+        // Reset the scanner state.
+        _readHistory.Clear();
+        _currentState = _startState;
+
+        return new LexicalToken(definition, literal);
+    }
+
+    /// <summary>
+    /// Creates a builder that already defines the line-break and white-space tokens
+    /// and marks both of them as skipped.
+    /// </summary>
+    public static LexicalScannerBuilder CreateDefaultBuilder()
+    {
+        LexicalScannerBuilder builder = new();
+
+        builder.DefineToken(LexicalToken.LineBreaker);
+        builder.DefineToken(LexicalToken.WhiteSpace);
+
+        builder.AddSkippedToken(LexicalToken.LineBreaker);
+        builder.AddSkippedToken(LexicalToken.WhiteSpace);
+
+        return builder;
+    }
+
+    /// <summary>
+    /// Creates a builder with no predefined tokens.
+    /// </summary>
+    public static LexicalScannerBuilder CreateEmptyBuilder() => new();
+}
diff --git a/CanonSharp.Common/LexicalAnalyzer/LexicalScannerBuilder.cs b/CanonSharp.Common/LexicalAnalyzer/LexicalScannerBuilder.cs
new file mode 100644
index 0000000..5cdf402
--- /dev/null
+++ b/CanonSharp.Common/LexicalAnalyzer/LexicalScannerBuilder.cs
@@ -0,0 +1,62 @@
+using CanonSharp.Common.Abstractions;
+
+namespace CanonSharp.Common.LexicalAnalyzer;
+
+/// <summary>
+/// Collects token definitions and builds a <see cref="LexicalScanner"/> that recognises them.
+/// </summary>
+public class LexicalScannerBuilder
+{
+    // Maps every accepting NFA state to the token definition it belongs to.
+    private readonly Dictionary<NondeterministicState, LexicalToken> _finalStateMap = [];
+    private readonly List<NondeterministicFiniteAutomation> _nondeterministicFiniteAutomations = [];
+    private readonly HashSet<LexicalToken> _skippedTokens = [];
+
+    internal LexicalScannerBuilder()
+    {
+    }
+
+    /// <summary>
+    /// Registers a token definition by converting its expression to an NFA.
+    /// </summary>
+    /// <param name="token">The token definition to recognise.</param>
+    public void DefineToken(LexicalToken token)
+    {
+        ArgumentNullException.ThrowIfNull(token);
+
+        NondeterministicFiniteAutomation automation = token.Expression.Convert2Nfa();
+        _nondeterministicFiniteAutomations.Add(automation);
+
+        foreach (NondeterministicState state in automation.FinalStates)
+        {
+            _finalStateMap.Add(state, token);
+        }
+    }
+
+    /// <summary>
+    /// Marks a token definition as skipped, so the built scanner discards it.
+    /// </summary>
+    public void AddSkippedToken(LexicalToken token) => _skippedTokens.Add(token);
+
+    /// <summary>
+    /// Builds a scanner over <paramref name="reader"/> for all tokens defined so far.
+    /// </summary>
+    /// <param name="reader">Character source the scanner reads from.</param>
+    /// <returns>A scanner whose accepting states map to the highest-priority matching token.</returns>
+    public LexicalScanner Build(ISourceReader reader)
+    {
+        DeterministicFiniteAutomation automation = DeterministicFiniteAutomation.Create(Combine());
+
+        Dictionary<DeterministicState, LexicalToken> finalTokenMap = [];
+
+        foreach (DeterministicState state in automation.FinalStates)
+        {
+            finalTokenMap.Add(state, SelectToken(state));
+        }
+
+        // Hand the scanner its own copy so later AddSkippedToken calls on this
+        // builder do not change a scanner that has already been built.
+        return new LexicalScanner(automation.Start, finalTokenMap, new HashSet<LexicalToken>(_skippedTokens),
+            reader);
+    }
+
+    /// <summary>
+    /// Picks the token with the highest priority among the accepting NFA states in the
+    /// closure of <paramref name="state"/>. On equal priority the first one encountered wins.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">No state in the closure belongs to a defined token.</exception>
+    private LexicalToken SelectToken(DeterministicState state)
+    {
+        LexicalToken? best = null;
+
+        foreach (NondeterministicState nfaState in state.Closure)
+        {
+            if (_finalStateMap.TryGetValue(nfaState, out LexicalToken? candidate)
+                && (best is null || candidate.Priority > best.Priority))
+            {
+                best = candidate;
+            }
+        }
+
+        return best ?? throw new InvalidOperationException(
+            "An accepting state does not correspond to any defined token.");
+    }
+
+    /// <summary>
+    /// Joins all token automata under one new start state by adding an empty
+    /// transition from it to each automaton's start state.
+    /// </summary>
+    private NondeterministicFiniteAutomation Combine()
+    {
+        NondeterministicState head = new();
+        NondeterministicFiniteAutomation result = new(head, []);
+
+        foreach (NondeterministicFiniteAutomation automation in _nondeterministicFiniteAutomations)
+        {
+            head.AddTransaction(EmptyChar.Empty, automation.Start);
+            result.FinalStates.UnionWith(automation.FinalStates);
+        }
+
+        return result;
+    }
+}
diff --git a/CanonSharp.Common/LexicalAnalyzer/LexicalToken.cs b/CanonSharp.Common/LexicalAnalyzer/LexicalToken.cs
new file mode 100644
index 0000000..e6be9eb
--- /dev/null
+++ b/CanonSharp.Common/LexicalAnalyzer/LexicalToken.cs
@@ -0,0 +1,56 @@
+using System.Globalization;
+
+namespace CanonSharp.Common.LexicalAnalyzer;
+
+/// <summary>
+/// A token definition (expression plus priority) or a scanned occurrence of one.
+/// Equality is based on the definition's identifier, so a scanned token compares
+/// equal to the definition it was created from regardless of its literal value.
+/// </summary>
+public class LexicalToken : IEquatable<LexicalToken>
+{
+    private readonly Guid _tokenId;
+
+    /// <summary>
+    /// The regular expression that recognises this token.
+    /// </summary>
+    public RegularExpression Expression { get; }
+
+    /// <summary>
+    /// Priority of the definition; a larger value is preferred when several tokens match.
+    /// </summary>
+    public int Priority { get; }
+
+    /// <summary>
+    /// The matched text; empty for a token definition.
+    /// </summary>
+    public string LiteralValue { get; }
+
+    /// <summary>
+    /// Creates a new token definition with its own identity.
+    /// </summary>
+    public LexicalToken(RegularExpression expression, int priority)
+    {
+        _tokenId = Guid.NewGuid();
+        Expression = expression;
+        Priority = priority;
+        LiteralValue = string.Empty;
+    }
+
+    /// <summary>
+    /// Creates a scanned occurrence of <paramref name="definition"/> carrying the matched text.
+    /// </summary>
+    internal LexicalToken(LexicalToken definition, string literalValue)
+    {
+        _tokenId = definition._tokenId;
+        Expression = definition.Expression;
+        Priority = definition.Priority;
+        LiteralValue = literalValue;
+    }
+
+    public bool Equals(LexicalToken? other) => other is not null && _tokenId == other._tokenId;
+
+    public override bool Equals(object? obj) => obj is LexicalToken other && Equals(other);
+
+    public override int GetHashCode() => _tokenId.GetHashCode();
+
+    // Null-safe: two nulls are equal, and a null left operand no longer throws.
+    public static bool operator ==(LexicalToken? a, LexicalToken? b) => a is null ? b is null : a.Equals(b);
+
+    public static bool operator !=(LexicalToken? a, LexicalToken? b) => !(a == b);
+
+    /// <summary>
+    /// Matches a single white-space character: any character in the Unicode
+    /// space-separator category, or horizontal tab, vertical tab or form feed.
+    /// </summary>
+    public static readonly LexicalToken WhiteSpace = new(
+        RegularExpression.Alternate(
+            RegularExpression.CharSetOf(c => char.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator),
+            RegularExpression.CharSetOf("\u0009\u000B\u000C")), int.MinValue);
+
+    /// <summary>
+    /// Matches a line break: one of CR, LF, U+0085, U+2028 or U+2029, or the sequence CR LF.
+    /// </summary>
+    public static readonly LexicalToken LineBreaker = new(
+        RegularExpression.Alternate(
+            RegularExpression.CharSetOf("\u000D\u000A\u0085\u2028\u2029"),
+            RegularExpression.String("\r\n")), int.MinValue);
+}
diff --git a/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs b/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs
index e051535..79a98d8 100644
--- a/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs
+++ b/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs
@@ -4,17 +4,76 @@ public abstract class RegularExpression
{
public abstract NondeterministicFiniteAutomation Convert2Nfa();
+ /// <summary>
+ /// Matches the empty string.
+ /// </summary>
public static RegularExpression Empty => new EmptyExpression();
+ /// <summary>
+ /// Matches a single character.
+ /// Pattern: <c>c</c>
+ /// </summary>
+ /// <param name="c">The character to match.</param>
+ /// <returns>An expression matching exactly that character.</returns>
public static RegularExpression Single(char c) => new SymbolExpression(c);
+ /// <summary>
+ /// Alternation. Pattern: <c>left|right</c>
+ /// </summary>
+ /// <param name="left">The first alternative.</param>
+ /// <param name="right">The second alternative.</param>
+ /// <returns>An expression built from both alternatives.</returns>
public static RegularExpression Alternate(RegularExpression left, RegularExpression right) =>
new AlternationExpression(left, right);
+ /// <summary>
+ /// Concatenation. Pattern: <c>first</c> followed by <c>second</c>.
+ /// </summary>
+ /// <param name="first">The expression that comes first.</param>
+ /// <param name="second">The expression that comes second.</param>
+ /// <returns>An expression built from both parts in order.</returns>
public static RegularExpression Concatenate(RegularExpression first, RegularExpression second) =>
new ConcatenationExpression(first, second);
+ /// <summary>
+ /// Kleene closure. Pattern: <c>inner*</c>
+ /// </summary>
+ /// <param name="inner">The expression to repeat.</param>
+ /// <returns>An expression wrapping <paramref name="inner"/> in a Kleene closure.</returns>
public static RegularExpression Kleene(RegularExpression inner) => new KleeneExpression(inner);
+
+ /// <summary>
+ /// Matches a literal string. Pattern: <c>value</c>
+ /// </summary>
+ /// <param name="value">The text to match, character by character.</param>
+ /// <returns>An expression matching exactly that text.</returns>
+ public static RegularExpression String(string value) => new StringExpression(value);
+
+ /// <summary>
+ /// Matches any single character contained in <paramref name="value"/>.
+ /// </summary>
+ /// <param name="value">The characters that make up the set.</param>
+ public static RegularExpression CharSetOf(string value) => new CharSetExpression(value.ToCharArray());
+
+ /// <summary>
+ /// Matches any single character accepted by <paramref name="predicate"/>. The set is
+ /// computed eagerly by filtering the characters produced by
+ /// <c>Iterate(char.MinValue, char.MaxValue)</c>.
+ /// </summary>
+ /// <param name="predicate">Decides whether a character belongs to the set.</param>
+ public static RegularExpression CharSetOf(Func predicate)
+ => new CharSetExpression(Iterate(char.MinValue, char.MaxValue).Where(predicate).ToArray());
+
+ /// <summary>
+ /// Matches any single character in a range. Pattern: <c>[a-b]</c>
+ /// </summary>
+ /// <param name="a">The first character of the range.</param>
+ /// <param name="b">The last character of the range.</param>
+ /// <returns>A character-set expression over the characters from <paramref name="a"/> to <paramref name="b"/>.</returns>
+ public static RegularExpression Range(char a, char b) => new CharSetExpression(Iterate(a, b).ToArray());
+
+    /// <summary>
+    /// Enumerates every character from <paramref name="a"/> to <paramref name="b"/>, both inclusive.
+    /// Yields nothing when <paramref name="a"/> is greater than <paramref name="b"/>.
+    /// </summary>
+    private static IEnumerable<char> Iterate(char a, char b)
+    {
+        for (char c = a; c <= b; c++)
+        {
+            yield return c;
+
+            // Stop after yielding char.MaxValue: incrementing it would overflow and
+            // the loop condition could never become false.
+            if (c == char.MaxValue)
+            {
+                yield break;
+            }
+        }
+    }
}
public class EmptyExpression : RegularExpression
@@ -107,3 +166,59 @@ public class KleeneExpression(RegularExpression inner) : RegularExpression
return new NondeterministicFiniteAutomation(final, [final]);
}
}
+
+// Expression that matches exactly one character out of a fixed, non-empty set.
+public class CharSetExpression : RegularExpression
+{
+ // The characters this expression accepts.
+ public char[] Set { get; }
+
+ // Copies the given characters into Set.
+ // Throws InvalidOperationException when the set is empty.
+ public CharSetExpression(Span set)
+ {
+ if (set.Length == 0)
+ {
+ throw new InvalidOperationException();
+ }
+
+ Set = set.ToArray();
+ }
+
+ // Builds a two-state NFA: one transition per character in Set, all leading
+ // from the start state to the single final state.
+ public override NondeterministicFiniteAutomation Convert2Nfa()
+ {
+ NondeterministicState start = new();
+ NondeterministicState final = new();
+
+ foreach (char c in Set)
+ {
+ start.AddTransaction(new EmptyChar(c), final);
+ }
+
+ return new NondeterministicFiniteAutomation(start, [final]);
+ }
+}
+
+/// <summary>
+/// Expression that matches one fixed, non-empty piece of text.
+/// </summary>
+public class StringExpression : RegularExpression
+{
+    /// <summary>
+    /// The text this expression matches.
+    /// </summary>
+    public string Word { get; }
+
+    /// <summary>
+    /// Creates the expression for <paramref name="word"/>.
+    /// </summary>
+    /// <exception cref="InvalidOperationException"><paramref name="word"/> is null or empty.</exception>
+    public StringExpression(string word)
+    {
+        Word = string.IsNullOrEmpty(word) ? throw new InvalidOperationException() : word;
+    }
+
+    /// <summary>
+    /// Builds a chain of states with one transition per character of <see cref="Word"/>;
+    /// the last state of the chain is the only final state.
+    /// </summary>
+    public override NondeterministicFiniteAutomation Convert2Nfa()
+    {
+        NondeterministicState start = new();
+        NondeterministicState tail = start;
+
+        foreach (char symbol in Word)
+        {
+            NondeterministicState successor = new();
+            tail.AddTransaction(new EmptyChar(symbol), successor);
+            tail = successor;
+        }
+
+        return new NondeterministicFiniteAutomation(start, [tail]);
+    }
+}
diff --git a/CanonSharp.Common/Reader/SourceReader.cs b/CanonSharp.Common/Reader/SourceReader.cs
new file mode 100644
index 0000000..0c2b818
--- /dev/null
+++ b/CanonSharp.Common/Reader/SourceReader.cs
@@ -0,0 +1,78 @@
+using CanonSharp.Common.Abstractions;
+
+namespace CanonSharp.Common.Reader;
+
+/// <summary>
+/// <see cref="ISourceReader"/> over a file, read in blocks of 1024 characters.
+/// The reader owns the underlying file stream; dispose it to release the file handle.
+/// </summary>
+public class SourceReader : ISourceReader, IDisposable
+{
+    private readonly StreamReader _reader;
+
+    // Block buffer: _length is the number of valid characters, _count the number already handed out.
+    private readonly char[] _buffer = new char[1024];
+    private int _length;
+    private int _count;
+
+    // Character fetched by TryPeek that has not been consumed by Read yet.
+    private char? _lookAhead;
+
+    /// <summary>
+    /// Opens <paramref name="filename"/> for reading.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The file does not exist.</exception>
+    public SourceReader(string filename)
+    {
+        FileInfo source = new(filename);
+
+        if (!source.Exists)
+        {
+            throw new InvalidOperationException($"Source file '{filename}' does not exist.");
+        }
+
+        _reader = new StreamReader(filename);
+    }
+
+    /// <summary>
+    /// Reads and consumes the next character.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The end of the file has been reached.</exception>
+    public char Read()
+    {
+        if (_lookAhead.HasValue)
+        {
+            char result = _lookAhead.Value;
+            _lookAhead = null;
+            return result;
+        }
+
+        if (!TryFetchChar(out char c))
+        {
+            throw new InvalidOperationException("Cannot read past the end of the source file.");
+        }
+
+        return c;
+    }
+
+    /// <summary>
+    /// Peeks at the next character without consuming it.
+    /// </summary>
+    /// <returns><c>true</c> if a character is available; <c>false</c> at end of file.</returns>
+    public bool TryPeek(out char c)
+    {
+        if (_lookAhead.HasValue)
+        {
+            c = _lookAhead.Value;
+            return true;
+        }
+
+        if (!TryFetchChar(out c))
+        {
+            return false;
+        }
+
+        // Remember the fetched character so the next Read or TryPeek returns it again.
+        _lookAhead = c;
+        return true;
+    }
+
+    /// <summary>
+    /// Closes the underlying file.
+    /// </summary>
+    public void Dispose()
+    {
+        _reader.Dispose();
+        GC.SuppressFinalize(this);
+    }
+
+    /// <summary>
+    /// Takes the next character from the buffer, refilling it from the file when it is used up.
+    /// </summary>
+    private bool TryFetchChar(out char c)
+    {
+        if (_length == _count)
+        {
+            _length = _reader.Read(_buffer);
+            _count = 0;
+        }
+
+        // A refill that returns no characters means end of file.
+        if (_length == 0)
+        {
+            c = char.MinValue;
+            return false;
+        }
+
+        c = _buffer[_count];
+        _count += 1;
+        return true;
+    }
+}
diff --git a/CanonSharp.Common/Reader/StringReader.cs b/CanonSharp.Common/Reader/StringReader.cs
new file mode 100644
index 0000000..e7493ef
--- /dev/null
+++ b/CanonSharp.Common/Reader/StringReader.cs
@@ -0,0 +1,33 @@
+using CanonSharp.Common.Abstractions;
+
+namespace CanonSharp.Common.Reader;
+
+/// <summary>
+/// <see cref="ISourceReader"/> over an in-memory string.
+/// </summary>
+public class StringReader(string source) : ISourceReader
+{
+    // Index of the next character to hand out.
+    private int _pos;
+
+    /// <summary>
+    /// Reads and consumes the next character.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The end of the string has been reached.</exception>
+    public char Read()
+    {
+        if (_pos < source.Length)
+        {
+            return source[_pos++];
+        }
+
+        throw new InvalidOperationException();
+    }
+
+    /// <summary>
+    /// Peeks at the next character without consuming it.
+    /// </summary>
+    /// <returns><c>true</c> if a character is available; <c>false</c> at the end of the string.</returns>
+    public bool TryPeek(out char c)
+    {
+        bool available = _pos < source.Length;
+        c = available ? source[_pos] : char.MinValue;
+        return available;
+    }
+}
diff --git a/CanonSharp.Tests/LexicalAnalyzerTests/ReaderTests.cs b/CanonSharp.Tests/LexicalAnalyzerTests/ReaderTests.cs
new file mode 100644
index 0000000..01b6df7
--- /dev/null
+++ b/CanonSharp.Tests/LexicalAnalyzerTests/ReaderTests.cs
@@ -0,0 +1,25 @@
+using CanonSharp.Common.Abstractions;
+using StringReader = CanonSharp.Common.Reader.StringReader;
+
+namespace CanonSharp.Tests.LexicalAnalyzerTests;
+
+public class ReaderTests
+{
+    /// <summary>
+    /// Peeking twice returns the same character both times, reading then consumes it,
+    /// and peeking after the last character reports that nothing is left.
+    /// </summary>
+    [Fact]
+    public void StringReaderTest()
+    {
+        StringReader reader = new("ab");
+
+        foreach (char expected in "ab")
+        {
+            for (int attempt = 0; attempt < 2; attempt++)
+            {
+                Assert.True(reader.TryPeek(out char seen));
+                Assert.Equal(expected, seen);
+            }
+
+            Assert.Equal(expected, reader.Read());
+        }
+
+        Assert.False(reader.TryPeek(out _));
+    }
+}
diff --git a/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs b/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs
index 8fb8820..da1b49e 100644
--- a/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs
+++ b/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs
@@ -43,6 +43,15 @@ public class RegularExpressionTests
s => s.Transactions.ContainsKey(new EmptyChar('a')));
}
+    /// <summary>
+    /// The range a-z produces one transition out of the start state per letter.
+    /// </summary>
+    [Fact]
+    public void RangeTest()
+    {
+        NondeterministicFiniteAutomation nfa = RegularExpression.Range('a', 'z').Convert2Nfa();
+
+        int outgoing = nfa.Start.Transactions.Count;
+
+        Assert.Equal(26, outgoing);
+    }
+ }
+
[Fact]
public void ConvertTest()
{
@@ -74,4 +83,22 @@ public class RegularExpressionTests
Assert.Equal('a', map[new NondeterministicStateSet([key2, key1])]);
}
+
+    /// <summary>
+    /// Smoke test: converting two alternatives that share a prefix ("string" and
+    /// "string1") to a DFA completes without throwing.
+    /// </summary>
+    [Fact]
+    public void PrefixConvertTest()
+    {
+        RegularExpression shorter = RegularExpression.String("string");
+        RegularExpression longer = RegularExpression.String("string1");
+
+        NondeterministicFiniteAutomation nfa = RegularExpression.Alternate(shorter, longer).Convert2Nfa();
+
+        DeterministicFiniteAutomation.Create(nfa);
+    }
+
+    /// <summary>
+    /// Smoke test: the predefined white-space token can be converted to a DFA without throwing.
+    /// </summary>
+    [Fact]
+    public void WhiteSpaceConvertTest()
+    {
+        RegularExpression whiteSpace = LexicalToken.WhiteSpace.Expression;
+
+        DeterministicFiniteAutomation.Create(whiteSpace.Convert2Nfa());
+    }
}
diff --git a/CanonSharp.Tests/LexicalAnalyzerTests/ScanTests.cs b/CanonSharp.Tests/LexicalAnalyzerTests/ScanTests.cs
new file mode 100644
index 0000000..a929b90
--- /dev/null
+++ b/CanonSharp.Tests/LexicalAnalyzerTests/ScanTests.cs
@@ -0,0 +1,71 @@
+using CanonSharp.Common.LexicalAnalyzer;
+using StringReader = CanonSharp.Common.Reader.StringReader;
+
+namespace CanonSharp.Tests.LexicalAnalyzerTests;
+
+// End-to-end tests: build a scanner from token definitions and check the tokens it yields.
+public class ScanTests
+{
+ // A single literal token "ab" is recognised and carries the matched text.
+ [Fact]
+ public void ScanTest1()
+ {
+ LexicalScannerBuilder builder = LexicalScanner.CreateEmptyBuilder();
+
+ LexicalToken token1 = new(RegularExpression.String("ab"), 1);
+ builder.DefineToken(token1);
+
+ StringReader reader = new("ab");
+ LexicalScanner scanner = builder.Build(reader);
+
+ Assert.True(scanner.TryRead(out LexicalToken? result));
+ // Token equality compares the definition id, so the scanned token equals token1.
+ Assert.Equal(token1, result);
+ Assert.Equal("ab", result.LiteralValue);
+ }
+
+ // Scans two statements with the default builder, which defines and skips
+ // white space and line breaks.
+ [Fact]
+ public void ScanTest2()
+ {
+ LexicalScannerBuilder builder = LexicalScanner.CreateDefaultBuilder();
+
+ // "string" matches both the keyword and the identifier expression; the keyword
+ // has the higher priority (100 vs 0) and is the token expected below.
+ LexicalToken stringKeyword = new(RegularExpression.String("string"), 100);
+ LexicalToken periodDelimiter = new(RegularExpression.Single('.'), 100);
+ LexicalToken semiColonDelimiter = new(RegularExpression.Single(';'), 100);
+ // Identifier: one lowercase letter followed by any number of lowercase letters.
+ LexicalToken identifier = new(RegularExpression.Concatenate(RegularExpression.Range('a', 'z'),
+ RegularExpression.Kleene(RegularExpression.Range('a', 'z'))), 0);
+ LexicalToken assigner = new(RegularExpression.String(":="), 100);
+ builder.DefineToken(stringKeyword);
+ builder.DefineToken(periodDelimiter);
+ builder.DefineToken(semiColonDelimiter);
+ builder.DefineToken(identifier);
+ builder.DefineToken(assigner);
+
+ StringReader reader = new("""
+ string value := origin;
+ string abc := value.
+ """);
+ LexicalScanner scanner = builder.Build(reader);
+ Validate(scanner, [
+ stringKeyword,
+ identifier,
+ assigner,
+ identifier,
+ semiColonDelimiter,
+ stringKeyword,
+ identifier,
+ assigner,
+ identifier,
+ periodDelimiter
+ ]);
+
+ // Nothing is left once the expected tokens have been read.
+ Assert.False(scanner.TryRead(out _));
+ }
+
+ // Reads one token per expected definition and asserts they match in order.
+ private static void Validate(LexicalScanner scanner, IEnumerable expectedTokens)
+ {
+ foreach (LexicalToken token in expectedTokens)
+ {
+ Assert.True(scanner.TryRead(out LexicalToken? outToken));
+ Assert.NotNull(outToken);
+ Assert.Equal(token, outToken);
+ }
+ }
+}