diff --git a/CanonSharp.Common/Abstractions/ISourceReader.cs b/CanonSharp.Common/Abstractions/ISourceReader.cs new file mode 100644 index 0000000..c53fa2f --- /dev/null +++ b/CanonSharp.Common/Abstractions/ISourceReader.cs @@ -0,0 +1,17 @@ +namespace CanonSharp.Common.Abstractions; + +public interface ISourceReader +{ + /// <summary> + /// Peeks at the next character without consuming it. + /// </summary> + /// <param name="c">The next character that was peeked.</param> + /// <returns>Whether a character was available to peek.</returns> + public bool TryPeek(out char c); + + /// <summary> + /// Reads and consumes the next character. + /// </summary> + /// <returns>The character that was read.</returns> + public char Read(); +} diff --git a/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs b/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs index 6d52008..eaf844f 100644 --- a/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs +++ b/CanonSharp.Common/LexicalAnalyzer/DeterministicFiniteAutomation.cs @@ -1,11 +1,13 @@ namespace CanonSharp.Common.LexicalAnalyzer; -public class DeterministicState : IEquatable +public class DeterministicState(HashSet closure) : IEquatable { public Guid Id { get; } = Guid.NewGuid(); public Dictionary Transaction { get; } = []; + public HashSet Closure { get; } = closure; + public bool Equals(DeterministicState? other) => other is not null && Id.Equals(other.Id); public override bool Equals(object? 
obj) => obj is DeterministicState other && Equals(other); @@ -35,7 +37,7 @@ public class DeterministicFiniteAutomation HashSet finalStates = []; HashSet startClosure = nfa.Start.CalculateEmptyClosure(); - DeterministicState start = new(); + DeterministicState start = new(startClosure); map.Add(new NondeterministicStateSet(startClosure), start); queue.Enqueue(new Pair(startClosure, start)); @@ -50,7 +52,7 @@ public class DeterministicFiniteAutomation foreach (NondeterministicState state in pair.States) { - foreach (KeyValuePair> transaction in + foreach (KeyValuePair> transaction in state.Transactions.Where(p => !p.Key.IsEmpty)) { HashSet closure = []; @@ -64,16 +66,19 @@ public class DeterministicFiniteAutomation { n.UnionWith(closure); } - next.Add(transaction.Key.Char, closure); + else + { + next.Add(transaction.Key.Char, closure); + } } } - foreach (KeyValuePair> transaction in next) + foreach (KeyValuePair> transaction in next) { NondeterministicStateSet set = new(transaction.Value); if (!map.TryGetValue(set, out DeterministicState? nextState)) { - nextState = new DeterministicState(); + nextState = new DeterministicState(transaction.Value); map.Add(set, nextState); } diff --git a/CanonSharp.Common/LexicalAnalyzer/LexicalScanner.cs b/CanonSharp.Common/LexicalAnalyzer/LexicalScanner.cs new file mode 100644 index 0000000..caa8c2b --- /dev/null +++ b/CanonSharp.Common/LexicalAnalyzer/LexicalScanner.cs @@ -0,0 +1,92 @@ +using System.Diagnostics.CodeAnalysis; +using CanonSharp.Common.Abstractions; + +namespace CanonSharp.Common.LexicalAnalyzer; + +public class LexicalScanner( + DeterministicState startState, + Dictionary finalStateMap, + HashSet skippedTokens, + ISourceReader reader) +{ + private readonly DeterministicState _startState = startState; + + private readonly List _readHistory = []; + + private DeterministicState _currentState = startState; + + public bool TryRead([NotNullWhen(true)] out LexicalToken? 
token) + { + while (TryReadInternal(out token)) + { + if (!skippedTokens.Contains(token)) + { + return true; + } + } + + return false; + } + + private bool TryReadInternal([NotNullWhen(true)] out LexicalToken? token) + { + while (reader.TryPeek(out char c)) + { + if (_currentState.Transaction.TryGetValue(c, out DeterministicState? nextState)) + { + // A transition to the next state exists: follow it and consume the character + _currentState = nextState; + _readHistory.Add(reader.Read()); + } + else + { + // No transition exists for the peeked character + if (!finalStateMap.TryGetValue(_currentState, out LexicalToken? possibleToken)) + { + throw new InvalidOperationException(); + } + + // The current state is a final state: emit the token read so far + token = new LexicalToken(possibleToken, new string(_readHistory.ToArray())); + + // Reset the scanner back to the start state + _readHistory.Clear(); + _currentState = _startState; + return true; + } + } + + // Input is exhausted; the current state is a final state + if (finalStateMap.TryGetValue(_currentState, out LexicalToken? possibleToken2)) + { + token = new LexicalToken(possibleToken2, new string(_readHistory.ToArray())); + + _readHistory.Clear(); + _currentState = _startState; + return true; + } + + if (!_currentState.Equals(_startState)) + { + throw new InvalidOperationException(); + } + + token = null; + return false; + } + + public static LexicalScannerBuilder CreateDefaultBuilder() + { + LexicalScannerBuilder builder = new(); + + builder.DefineToken(LexicalToken.LineBreaker); + builder.DefineToken(LexicalToken.WhiteSpace); + + builder.AddSkippedToken(LexicalToken.LineBreaker); + builder.AddSkippedToken(LexicalToken.WhiteSpace); + + return builder; + } + + public static LexicalScannerBuilder CreateEmptyBuilder() => new(); +} diff --git a/CanonSharp.Common/LexicalAnalyzer/LexicalScannerBuilder.cs b/CanonSharp.Common/LexicalAnalyzer/LexicalScannerBuilder.cs new file mode 100644 index 0000000..5cdf402 --- /dev/null +++ 
b/CanonSharp.Common/LexicalAnalyzer/LexicalScannerBuilder.cs @@ -0,0 +1,62 @@ +using CanonSharp.Common.Abstractions; + +namespace CanonSharp.Common.LexicalAnalyzer; + +public class LexicalScannerBuilder +{ + private readonly 
Dictionary _finalStateMap = []; + private readonly List _nondeterministicFiniteAutomations = []; + private readonly HashSet _skippedTokens = []; + + internal LexicalScannerBuilder() + { + + } + + public void DefineToken(LexicalToken token) + { + NondeterministicFiniteAutomation automation = token.Expression.Convert2Nfa(); + _nondeterministicFiniteAutomations.Add(automation); + + foreach (NondeterministicState state in automation.FinalStates) + { + _finalStateMap.Add(state, token); + } + } + + public void AddSkippedToken(LexicalToken token) => _skippedTokens.Add(token); + + public LexicalScanner Build(ISourceReader reader) + { + NondeterministicFiniteAutomation finaAutomation = Combine(); + DeterministicFiniteAutomation deterministicFiniteAutomation = + DeterministicFiniteAutomation.Create(finaAutomation); + + Dictionary finalTokenMap = []; + + foreach (DeterministicState state in deterministicFiniteAutomation.FinalStates) + { + finalTokenMap.Add(state, state.Closure + .Where(s => _finalStateMap.ContainsKey(s)) + .Select(s => _finalStateMap[s]) + .OrderByDescending(t => t.Priority) + .First()); + } + + return new LexicalScanner(deterministicFiniteAutomation.Start, finalTokenMap, _skippedTokens, reader); + } + + private NondeterministicFiniteAutomation Combine() + { + NondeterministicState head = new(); + NondeterministicFiniteAutomation result = new(head, []); + + foreach (NondeterministicFiniteAutomation automation in _nondeterministicFiniteAutomations) + { + head.AddTransaction(EmptyChar.Empty, automation.Start); + result.FinalStates.UnionWith(automation.FinalStates); + } + + return result; + } +} diff --git a/CanonSharp.Common/LexicalAnalyzer/LexicalToken.cs b/CanonSharp.Common/LexicalAnalyzer/LexicalToken.cs new file mode 100644 index 0000000..e6be9eb --- /dev/null +++ b/CanonSharp.Common/LexicalAnalyzer/LexicalToken.cs @@ -0,0 +1,56 @@ +using System.Globalization; + +namespace CanonSharp.Common.LexicalAnalyzer; + +public class LexicalToken : IEquatable +{ + 
private readonly Guid _tokenId; + + public RegularExpression Expression { get; } + + public int Priority { get; } + + public LexicalToken(RegularExpression expression, int priority) + { + _tokenId = Guid.NewGuid(); + Expression = expression; + Priority = priority; + LiteralValue = string.Empty; + } + + internal LexicalToken(LexicalToken definition, string literalValue) + { + _tokenId = definition._tokenId; + Expression = definition.Expression; + Priority = definition.Priority; + LiteralValue = literalValue; + } + + public string LiteralValue { get; } + + public bool Equals(LexicalToken? other) => other is not null && _tokenId == other._tokenId; + + public override bool Equals(object? obj) => obj is LexicalToken other && Equals(other); + + public override int GetHashCode() => _tokenId.GetHashCode(); + + public static bool operator ==(LexicalToken a, LexicalToken b) => a.Equals(b); + + public static bool operator !=(LexicalToken a, LexicalToken b) => !(a == b); + + /// <summary> + /// Matches all whitespace characters. + /// </summary> + public static readonly LexicalToken WhiteSpace = new( + RegularExpression.Alternate( + RegularExpression.CharSetOf(c => char.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator), + RegularExpression.CharSetOf("\u0009\u000B\u000C")), int.MinValue); + + /// <summary> + /// Matches all line-break characters. + /// </summary> + public static readonly LexicalToken LineBreaker = new( + RegularExpression.Alternate( + RegularExpression.CharSetOf("\u000D\u000A\u0085\u2028\u2029"), + RegularExpression.String("\r\n")), int.MinValue); +} diff --git a/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs b/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs index e051535..79a98d8 100644 --- a/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs +++ b/CanonSharp.Common/LexicalAnalyzer/RegularExpression.cs @@ -4,17 +4,76 @@ public abstract class RegularExpression { public abstract NondeterministicFiniteAutomation Convert2Nfa(); + /// <summary> + /// Matches the empty string. + /// </summary> public static RegularExpression Empty => new EmptyExpression(); + /// <summary> + /// 
匹配单个字符 + /// (Matches the single character c.) + /// </summary> + /// <param name="c">The character to match.</param> + /// <returns>A SymbolExpression wrapping c.</returns> public static RegularExpression Single(char c) => new SymbolExpression(c); + /// <summary> + /// Alternation: left|right + /// </summary> + /// <param name="left">The first alternative.</param> + /// <param name="right">The second alternative.</param> + /// <returns>An AlternationExpression over both operands.</returns> public static RegularExpression Alternate(RegularExpression left, RegularExpression right) => new AlternationExpression(left, right); + /// <summary> + /// Concatenation: first followed by second (left-right) + /// </summary> + /// <param name="first">The expression matched first.</param> + /// <param name="second">The expression matched second.</param> + /// <returns>A ConcatenationExpression over both operands.</returns> public static RegularExpression Concatenate(RegularExpression first, RegularExpression second) => new ConcatenationExpression(first, second); + /// <summary> + /// Kleene star: inner* + /// </summary> + /// <param name="inner">The expression to repeat.</param> + /// <returns>A KleeneExpression wrapping inner.</returns> public static RegularExpression Kleene(RegularExpression inner) => new KleeneExpression(inner); + + /// <summary> + /// Matches the literal string value. + /// </summary> + /// <param name="value">The literal text to match; StringExpression rejects null or empty input.</param> + /// <returns>A StringExpression for value.</returns> + public static RegularExpression String(string value) => new StringExpression(value); + + public static RegularExpression CharSetOf(string value) => new CharSetExpression(value.ToCharArray()); + + public static RegularExpression CharSetOf(Func predicate) + => new CharSetExpression(Iterate(char.MinValue, char.MaxValue).Where(predicate).ToArray()); + + /// <summary> + /// Character range: [a-b] + /// </summary> + /// <param name="a">The first character of the range.</param> + /// <param name="b">The last character of the range.</param> + /// <returns>A CharSetExpression over the characters produced by Iterate(a, b).</returns> + public static RegularExpression Range(char a, char b) => new CharSetExpression(Iterate(a, b).ToArray()); + + private static IEnumerable Iterate(char a, char b) + { + for (char c = a; c <= b; c++) + { + if (c == char.MaxValue) + { + yield break; + } + + yield return c; + } + } } public class EmptyExpression : RegularExpression @@ -107,3 +166,59 @@ public class KleeneExpression(RegularExpression inner) : RegularExpression return new NondeterministicFiniteAutomation(final, [final]); } } + +public class CharSetExpression : RegularExpression +{ + public char[] Set { get; } + + public CharSetExpression(Span set) + { + if (set.Length == 0) + { + throw new InvalidOperationException(); + } + + Set = set.ToArray(); + } + + public override NondeterministicFiniteAutomation Convert2Nfa() + { + NondeterministicState start = new(); + NondeterministicState final = new(); + + foreach (char c in Set) + { + start.AddTransaction(new EmptyChar(c), final); + } + 
+ return new NondeterministicFiniteAutomation(start, [final]); + } +} + +public class StringExpression : RegularExpression +{ + public string Word { get; } + + public StringExpression(string word) + { + if (string.IsNullOrEmpty(word)) + { + throw new InvalidOperationException(); + } + + Word = word; + } + + public override NondeterministicFiniteAutomation Convert2Nfa() + { + NondeterministicState start = new(); + NondeterministicState final = Word.Aggregate(start, (state, c) => + { + NondeterministicState next = new(); + state.AddTransaction(new EmptyChar(c), next); + return next; + }); + + return new NondeterministicFiniteAutomation(start, [final]); + } +} diff --git a/CanonSharp.Common/Reader/SourceReader.cs b/CanonSharp.Common/Reader/SourceReader.cs new file mode 100644 index 0000000..0c2b818 --- /dev/null +++ b/CanonSharp.Common/Reader/SourceReader.cs @@ -0,0 +1,78 @@ +using CanonSharp.Common.Abstractions; + +namespace CanonSharp.Common.Reader; + +public class SourceReader : ISourceReader +{ + private readonly StreamReader _reader; + private char? 
_lookAhead; + + public SourceReader(string filename) + { + FileInfo source = new(filename); + + if (!source.Exists) + { + throw new InvalidOperationException(); + } + + _reader = new StreamReader(filename); + } + + public char Read() + { + if (_lookAhead.HasValue) + { + char result = _lookAhead.Value; + _lookAhead = null; + return result; + } + + if (!TryFetchChar(out char c)) + { + throw new InvalidOperationException(); + } + + return c; + } + + public bool TryPeek(out char c) + { + if (_lookAhead.HasValue) + { + c = _lookAhead.Value; + return true; + } + + if (!TryFetchChar(out c)) + { + return false; + } + + _lookAhead = c; + return true; + } + + private readonly char[] _buffer = new char[1024]; + private int _length; + private int _count; + + private bool TryFetchChar(out char c) + { + if (_length == _count) + { + _length = _reader.Read(_buffer); + _count = 0; + } + + if (_length == 0) + { + c = char.MinValue; + return false; + } + + c = _buffer[_count]; + _count += 1; + return true; + } +} diff --git a/CanonSharp.Common/Reader/StringReader.cs b/CanonSharp.Common/Reader/StringReader.cs new file mode 100644 index 0000000..e7493ef --- /dev/null +++ b/CanonSharp.Common/Reader/StringReader.cs @@ -0,0 +1,33 @@ +using CanonSharp.Common.Abstractions; + +namespace CanonSharp.Common.Reader; + +public class StringReader(string source) : ISourceReader +{ + private int _pos; + + public char Read() + { + if (_pos >= source.Length) + { + throw new InvalidOperationException(); + } + + char result = source[_pos]; + _pos += 1; + return result; + + } + + public bool TryPeek(out char c) + { + if (_pos < source.Length) + { + c = source[_pos]; + return true; + } + + c = char.MinValue; + return false; + } +} diff --git a/CanonSharp.Tests/LexicalAnalyzerTests/ReaderTests.cs b/CanonSharp.Tests/LexicalAnalyzerTests/ReaderTests.cs new file mode 100644 index 0000000..01b6df7 --- /dev/null +++ b/CanonSharp.Tests/LexicalAnalyzerTests/ReaderTests.cs @@ -0,0 +1,25 @@ +using 
CanonSharp.Common.Abstractions; +using StringReader = CanonSharp.Common.Reader.StringReader; + +namespace CanonSharp.Tests.LexicalAnalyzerTests; + +public class ReaderTests +{ + [Fact] + public void StringReaderTest() + { + StringReader reader = new("ab"); + + Assert.True(reader.TryPeek(out char c)); + Assert.Equal('a', c); + Assert.True(reader.TryPeek(out c)); + Assert.Equal('a', c); + Assert.Equal('a', reader.Read()); + Assert.True(reader.TryPeek(out c)); + Assert.Equal('b', c); + Assert.True(reader.TryPeek(out c)); + Assert.Equal('b', c); + Assert.Equal('b', reader.Read()); + Assert.False(reader.TryPeek(out c)); + } +} diff --git a/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs b/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs index 8fb8820..da1b49e 100644 --- a/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs +++ b/CanonSharp.Tests/LexicalAnalyzerTests/RegularExpressionTests.cs @@ -43,6 +43,15 @@ public class RegularExpressionTests s => s.Transactions.ContainsKey(new EmptyChar('a'))); } + [Fact] + public void RangeTest() + { + RegularExpression expression = RegularExpression.Range('a', 'z'); + NondeterministicFiniteAutomation nfa = expression.Convert2Nfa(); + + Assert.Equal(26, nfa.Start.Transactions.Count); + } + [Fact] public void ConvertTest() { @@ -74,4 +83,22 @@ public class RegularExpressionTests Assert.Equal('a', map[new NondeterministicStateSet([key2, key1])]); } + + [Fact] + public void PrefixConvertTest() + { + RegularExpression expression = RegularExpression.Alternate( + RegularExpression.String("string"), + RegularExpression.String("string1")); + + NondeterministicFiniteAutomation nfa = expression.Convert2Nfa(); + DeterministicFiniteAutomation.Create(nfa); + } + + [Fact] + public void WhiteSpaceConvertTest() + { + NondeterministicFiniteAutomation nfa = LexicalToken.WhiteSpace.Expression.Convert2Nfa(); + DeterministicFiniteAutomation.Create(nfa); + } } diff --git 
a/CanonSharp.Tests/LexicalAnalyzerTests/ScanTests.cs b/CanonSharp.Tests/LexicalAnalyzerTests/ScanTests.cs new file mode 100644 index 0000000..a929b90 --- /dev/null +++ b/CanonSharp.Tests/LexicalAnalyzerTests/ScanTests.cs @@ -0,0 +1,71 @@ +using CanonSharp.Common.LexicalAnalyzer; +using StringReader = CanonSharp.Common.Reader.StringReader; + +namespace CanonSharp.Tests.LexicalAnalyzerTests; + +public class ScanTests +{ + [Fact] + public void ScanTest1() + { + LexicalScannerBuilder builder = LexicalScanner.CreateEmptyBuilder(); + + LexicalToken token1 = new(RegularExpression.String("ab"), 1); + builder.DefineToken(token1); + + StringReader reader = new("ab"); + LexicalScanner scanner = builder.Build(reader); + + Assert.True(scanner.TryRead(out LexicalToken? result)); + Assert.Equal(token1, result); + Assert.Equal("ab", result.LiteralValue); + } + + [Fact] + public void ScanTest2() + { + LexicalScannerBuilder builder = LexicalScanner.CreateDefaultBuilder(); + + LexicalToken stringKeyword = new(RegularExpression.String("string"), 100); + LexicalToken periodDelimiter = new(RegularExpression.Single('.'), 100); + LexicalToken semiColonDelimiter = new(RegularExpression.Single(';'), 100); + LexicalToken identifier = new(RegularExpression.Concatenate(RegularExpression.Range('a', 'z'), + RegularExpression.Kleene(RegularExpression.Range('a', 'z'))), 0); + LexicalToken assigner = new(RegularExpression.String(":="), 100); + builder.DefineToken(stringKeyword); + builder.DefineToken(periodDelimiter); + builder.DefineToken(semiColonDelimiter); + builder.DefineToken(identifier); + builder.DefineToken(assigner); + + StringReader reader = new(""" + string value := origin; + string abc := value. 
+ """); + LexicalScanner scanner = builder.Build(reader); + Validate(scanner, [ + stringKeyword, + identifier, + assigner, + identifier, + semiColonDelimiter, + stringKeyword, + identifier, + assigner, + identifier, + periodDelimiter + ]); + + Assert.False(scanner.TryRead(out _)); + } + + private static void Validate(LexicalScanner scanner, IEnumerable expectedTokens) + { + foreach (LexicalToken token in expectedTokens) + { + Assert.True(scanner.TryRead(out LexicalToken? outToken)); + Assert.NotNull(outToken); + Assert.Equal(token, outToken); + } + } +}