feat: 正则词法识别器 (#1)

Reviewed-on: https://git.bupt-hpc.cn/jackfiled/CanonSharp/pulls/1
Co-authored-by: jackfiled <xcrenchangjun@outlook.com>
Co-committed-by: jackfiled <xcrenchangjun@outlook.com>
This commit is contained in:
jackfiled 2024-07-29 16:59:29 +08:00 committed by 任昌骏
parent 6ff8622906
commit 3c0d51cec5
11 changed files with 587 additions and 6 deletions

View File

@ -0,0 +1,17 @@
namespace CanonSharp.Common.Abstractions;
/// <summary>
/// A forward-only character source with one character of look-ahead.
/// </summary>
public interface ISourceReader
{
    /// <summary>
    /// Looks at the next character without consuming it.
    /// </summary>
    /// <param name="c">The next character that was seen.</param>
    /// <returns><c>true</c> when a next character is available; otherwise <c>false</c>.</returns>
    bool TryPeek(out char c);

    /// <summary>
    /// Reads (consumes) the next character.
    /// </summary>
    /// <returns>The character that was read.</returns>
    char Read();
}

View File

@ -1,11 +1,13 @@
namespace CanonSharp.Common.LexicalAnalyzer;
public class DeterministicState : IEquatable<DeterministicState>
public class DeterministicState(HashSet<NondeterministicState> closure) : IEquatable<DeterministicState>
{
public Guid Id { get; } = Guid.NewGuid();
public Dictionary<char, DeterministicState> Transaction { get; } = [];
public HashSet<NondeterministicState> Closure { get; } = closure;
public bool Equals(DeterministicState? other) => other is not null && Id.Equals(other.Id);
public override bool Equals(object? obj) => obj is DeterministicState other && Equals(other);
@ -35,7 +37,7 @@ public class DeterministicFiniteAutomation
HashSet<DeterministicState> finalStates = [];
HashSet<NondeterministicState> startClosure = nfa.Start.CalculateEmptyClosure();
DeterministicState start = new();
DeterministicState start = new(startClosure);
map.Add(new NondeterministicStateSet(startClosure), start);
queue.Enqueue(new Pair(startClosure, start));
@ -50,7 +52,7 @@ public class DeterministicFiniteAutomation
foreach (NondeterministicState state in pair.States)
{
foreach (KeyValuePair<EmptyChar,HashSet<NondeterministicState>> transaction in
foreach (KeyValuePair<EmptyChar, HashSet<NondeterministicState>> transaction in
state.Transactions.Where(p => !p.Key.IsEmpty))
{
HashSet<NondeterministicState> closure = [];
@ -64,16 +66,19 @@ public class DeterministicFiniteAutomation
{
n.UnionWith(closure);
}
next.Add(transaction.Key.Char, closure);
else
{
next.Add(transaction.Key.Char, closure);
}
}
}
foreach (KeyValuePair<char,HashSet<NondeterministicState>> transaction in next)
foreach (KeyValuePair<char, HashSet<NondeterministicState>> transaction in next)
{
NondeterministicStateSet set = new(transaction.Value);
if (!map.TryGetValue(set, out DeterministicState? nextState))
{
nextState = new DeterministicState();
nextState = new DeterministicState(transaction.Value);
map.Add(set, nextState);
}

View File

@ -0,0 +1,92 @@
using System.Diagnostics.CodeAnalysis;
using CanonSharp.Common.Abstractions;
namespace CanonSharp.Common.LexicalAnalyzer;
/// <summary>
/// Scanner that drives a deterministic automaton over the characters of an
/// <see cref="ISourceReader"/> and turns them into <see cref="LexicalToken"/> instances.
/// </summary>
/// <remarks>
/// The scanner is greedy: it keeps consuming characters for as long as the current state has a
/// transition for the next character. It never backtracks, so stopping in a state that is not
/// an accepting state is reported as an <see cref="InvalidOperationException"/>.
/// </remarks>
/// <param name="startState">State the automaton is in before the first character of a token.</param>
/// <param name="finalStateMap">Maps each accepting state to the token definition it recognises.</param>
/// <param name="skippedTokens">Token definitions that are recognised but never returned by <see cref="TryRead"/>.</param>
/// <param name="reader">Character source to scan.</param>
public class LexicalScanner(
    DeterministicState startState,
    Dictionary<DeterministicState, LexicalToken> finalStateMap,
    HashSet<LexicalToken> skippedTokens,
    ISourceReader reader)
{
    private readonly DeterministicState _startState = startState;

    // Characters consumed so far for the token that is currently being recognised.
    private readonly List<char> _readHistory = [];

    private DeterministicState _currentState = startState;

    /// <summary>
    /// Reads the next token that is not in the skipped set.
    /// </summary>
    /// <param name="token">The token that was read; <c>null</c> when the input is exhausted.</param>
    /// <returns><c>true</c> if a token was read; <c>false</c> once the input is exhausted.</returns>
    /// <exception cref="InvalidOperationException">The input cannot be split into defined tokens.</exception>
    public bool TryRead([NotNullWhen(true)] out LexicalToken? token)
    {
        while (TryReadInternal(out token))
        {
            if (!skippedTokens.Contains(token))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Reads the next token, including tokens that <see cref="TryRead"/> would skip.
    /// </summary>
    private bool TryReadInternal([NotNullWhen(true)] out LexicalToken? token)
    {
        while (reader.TryPeek(out char c))
        {
            if (_currentState.Transaction.TryGetValue(c, out DeterministicState? nextState))
            {
                // A transition exists: follow it and consume the character.
                _currentState = nextState;
                _readHistory.Add(reader.Read());
                continue;
            }

            // No transition for c. The characters read so far must form a complete token.
            // A zero-length match is rejected as well: emitting it would consume nothing and the
            // scanner would be stuck on the same character forever.
            if (_readHistory.Count == 0
                || !finalStateMap.TryGetValue(_currentState, out LexicalToken? definition))
            {
                throw new InvalidOperationException(
                    $"Unexpected character '{c}' (U+{(int)c:X4}): the input read so far does not form a complete token.");
            }

            token = Emit(definition);
            return true;
        }

        // End of input with nothing pending: scanning is finished. Checking the history first
        // (instead of the accepting-state map) guarantees termination even when the start state
        // itself is accepting.
        if (_readHistory.Count == 0)
        {
            token = null;
            return false;
        }

        if (!finalStateMap.TryGetValue(_currentState, out LexicalToken? lastDefinition))
        {
            throw new InvalidOperationException(
                "Unexpected end of input: the characters read so far do not form a complete token.");
        }

        token = Emit(lastDefinition);
        return true;
    }

    /// <summary>
    /// Creates the token for the characters read so far and resets the scanner to its start state.
    /// </summary>
    private LexicalToken Emit(LexicalToken definition)
    {
        LexicalToken token = new(definition, new string(_readHistory.ToArray()));
        _readHistory.Clear();
        _currentState = _startState;
        return token;
    }

    /// <summary>
    /// Creates a builder in which <see cref="LexicalToken.LineBreaker"/> and
    /// <see cref="LexicalToken.WhiteSpace"/> are already defined and marked as skipped.
    /// </summary>
    public static LexicalScannerBuilder CreateDefaultBuilder()
    {
        LexicalScannerBuilder builder = new();

        builder.DefineToken(LexicalToken.LineBreaker);
        builder.DefineToken(LexicalToken.WhiteSpace);
        builder.AddSkippedToken(LexicalToken.LineBreaker);
        builder.AddSkippedToken(LexicalToken.WhiteSpace);

        return builder;
    }

    /// <summary>
    /// Creates a builder without any predefined tokens.
    /// </summary>
    public static LexicalScannerBuilder CreateEmptyBuilder() => new();
}

View File

@ -0,0 +1,62 @@
using CanonSharp.Common.Abstractions;
namespace CanonSharp.Common.LexicalAnalyzer;
/// <summary>
/// Collects token definitions and builds a <see cref="LexicalScanner"/> from them.
/// </summary>
public class LexicalScannerBuilder
{
    // Maps every accepting NFA state to the token definition whose expression produced it.
    private readonly Dictionary<NondeterministicState, LexicalToken> _finalStateMap = [];
    private readonly List<NondeterministicFiniteAutomation> _nondeterministicFiniteAutomations = [];
    private readonly HashSet<LexicalToken> _skippedTokens = [];

    internal LexicalScannerBuilder()
    {
    }

    /// <summary>
    /// Registers a token definition with the scanner being built.
    /// </summary>
    /// <param name="token">Token definition whose expression is converted to an NFA.</param>
    /// <exception cref="ArgumentNullException"><paramref name="token"/> is <c>null</c>.</exception>
    public void DefineToken(LexicalToken token)
    {
        ArgumentNullException.ThrowIfNull(token);

        NondeterministicFiniteAutomation automation = token.Expression.Convert2Nfa();
        _nondeterministicFiniteAutomations.Add(automation);

        foreach (NondeterministicState state in automation.FinalStates)
        {
            _finalStateMap.Add(state, token);
        }
    }

    /// <summary>
    /// Marks a token definition as skipped, so the scanner's <c>TryRead</c> does not return it.
    /// </summary>
    public void AddSkippedToken(LexicalToken token) => _skippedTokens.Add(token);

    /// <summary>
    /// Combines all defined tokens into one automaton, makes it deterministic and creates a
    /// scanner over <paramref name="reader"/>.
    /// </summary>
    /// <param name="reader">Character source the scanner reads from.</param>
    /// <returns>A scanner for the tokens defined so far.</returns>
    public LexicalScanner Build(ISourceReader reader)
    {
        NondeterministicFiniteAutomation combinedAutomation = Combine();
        DeterministicFiniteAutomation deterministicFiniteAutomation =
            DeterministicFiniteAutomation.Create(combinedAutomation);

        Dictionary<DeterministicState, LexicalToken> finalTokenMap = [];
        foreach (DeterministicState state in deterministicFiniteAutomation.FinalStates)
        {
            finalTokenMap.Add(state, SelectToken(state));
        }

        // Hand the scanner its own copy of the skipped set so that later changes to this
        // builder cannot alter a scanner that has already been built.
        return new LexicalScanner(
            deterministicFiniteAutomation.Start,
            finalTokenMap,
            new HashSet<LexicalToken>(_skippedTokens),
            reader);
    }

    /// <summary>
    /// Picks, among the token definitions reachable through the closure of
    /// <paramref name="state"/>, the one with the highest priority.
    /// </summary>
    private LexicalToken SelectToken(DeterministicState state)
    {
        LexicalToken? best = null;
        foreach (NondeterministicState member in state.Closure)
        {
            // Strict comparison keeps the first candidate seen when priorities tie.
            if (_finalStateMap.TryGetValue(member, out LexicalToken? candidate)
                && (best is null || candidate.Priority > best.Priority))
            {
                best = candidate;
            }
        }

        return best ?? throw new InvalidOperationException(
            "An accepting state of the automaton is not associated with any defined token.");
    }

    /// <summary>
    /// Joins the automata of all defined tokens under a new start state that has an empty
    /// transition to the start of each of them.
    /// </summary>
    private NondeterministicFiniteAutomation Combine()
    {
        NondeterministicState head = new();
        NondeterministicFiniteAutomation result = new(head, []);

        foreach (NondeterministicFiniteAutomation automation in _nondeterministicFiniteAutomations)
        {
            head.AddTransaction(EmptyChar.Empty, automation.Start);
            result.FinalStates.UnionWith(automation.FinalStates);
        }

        return result;
    }
}

View File

@ -0,0 +1,56 @@
using System.Globalization;
namespace CanonSharp.Common.LexicalAnalyzer;
/// <summary>
/// A token definition (regular expression plus priority) or, when created by the scanner,
/// a recognised token that additionally carries the matched text.
/// </summary>
/// <remarks>
/// Equality is based on the identifier assigned when the definition is created, so a recognised
/// token compares equal to the definition it was produced from.
/// </remarks>
public class LexicalToken : IEquatable<LexicalToken>
{
    private readonly Guid _tokenId;

    /// <summary>Regular expression describing the token.</summary>
    public RegularExpression Expression { get; }

    /// <summary>Priority of the definition.</summary>
    public int Priority { get; }

    /// <summary>Text matched by the scanner; empty for a bare definition.</summary>
    public string LiteralValue { get; }

    /// <summary>
    /// Creates a new token definition with its own identity.
    /// </summary>
    public LexicalToken(RegularExpression expression, int priority)
    {
        _tokenId = Guid.NewGuid();
        Expression = expression;
        Priority = priority;
        LiteralValue = string.Empty;
    }

    /// <summary>
    /// Creates a recognised token that shares the identity of <paramref name="definition"/>
    /// and carries <paramref name="literalValue"/> as its text.
    /// </summary>
    internal LexicalToken(LexicalToken definition, string literalValue)
    {
        _tokenId = definition._tokenId;
        Expression = definition.Expression;
        Priority = definition.Priority;
        LiteralValue = literalValue;
    }

    public bool Equals(LexicalToken? other) => other is not null && _tokenId == other._tokenId;

    public override bool Equals(object? obj) => obj is LexicalToken other && Equals(other);

    public override int GetHashCode() => _tokenId.GetHashCode();

    // The operators accept null on either side; `is null` is used for the check because
    // `== null` would call this operator again.
    public static bool operator ==(LexicalToken? a, LexicalToken? b) => a is null ? b is null : a.Equals(b);

    public static bool operator !=(LexicalToken? a, LexicalToken? b) => !(a == b);

    /// <summary>
    /// Matches one white-space character: any character in the Unicode SpaceSeparator
    /// category, or U+0009, U+000B or U+000C.
    /// </summary>
    public static readonly LexicalToken WhiteSpace = new(
        RegularExpression.Alternate(
            RegularExpression.CharSetOf(c => char.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator),
            RegularExpression.CharSetOf("\u0009\u000B\u000C")), int.MinValue);

    /// <summary>
    /// Matches a line break: one of U+000D, U+000A, U+0085, U+2028, U+2029,
    /// or the two-character sequence "\r\n".
    /// </summary>
    public static readonly LexicalToken LineBreaker = new(
        RegularExpression.Alternate(
            RegularExpression.CharSetOf("\u000D\u000A\u0085\u2028\u2029"),
            RegularExpression.String("\r\n")), int.MinValue);
}

View File

@ -4,17 +4,76 @@ public abstract class RegularExpression
{
/// <summary>
/// Builds a nondeterministic finite automaton for this expression.
/// </summary>
/// <returns>The automaton built from this expression.</returns>
public abstract NondeterministicFiniteAutomation Convert2Nfa();
/// <summary>
/// An expression that matches the empty string. Each access creates a new instance.
/// </summary>
public static RegularExpression Empty
{
    get { return new EmptyExpression(); }
}
/// <summary>
/// Creates an expression that matches a single character: <c>c</c>.
/// </summary>
/// <param name="c">The character to match.</param>
/// <returns>A <see cref="SymbolExpression"/> for <paramref name="c"/>.</returns>
public static RegularExpression Single(char c)
{
    return new SymbolExpression(c);
}
/// <summary>
/// Creates the alternation <c>left|right</c>.
/// </summary>
/// <param name="left">First alternative.</param>
/// <param name="right">Second alternative.</param>
/// <returns>An <see cref="AlternationExpression"/> over both alternatives.</returns>
public static RegularExpression Alternate(RegularExpression left, RegularExpression right)
{
    return new AlternationExpression(left, right);
}
/// <summary>
/// Creates the concatenation of <paramref name="first"/> and <paramref name="second"/>.
/// </summary>
/// <param name="first">Expression on the left of the concatenation.</param>
/// <param name="second">Expression on the right of the concatenation.</param>
/// <returns>A <see cref="ConcatenationExpression"/> over both operands.</returns>
public static RegularExpression Concatenate(RegularExpression first, RegularExpression second)
{
    return new ConcatenationExpression(first, second);
}
/// <summary>
/// Creates the Kleene closure <c>inner*</c>.
/// </summary>
/// <param name="inner">Expression the closure is applied to.</param>
/// <returns>A <see cref="KleeneExpression"/> wrapping <paramref name="inner"/>.</returns>
public static RegularExpression Kleene(RegularExpression inner)
{
    return new KleeneExpression(inner);
}
/// <summary>
/// Creates an expression for the literal text <c>value</c>.
/// </summary>
/// <param name="value">The text to match.</param>
/// <returns>A <see cref="StringExpression"/> for <paramref name="value"/>.</returns>
public static RegularExpression String(string value)
{
    return new StringExpression(value);
}
/// <summary>
/// Creates a character set containing the characters of <paramref name="value"/>.
/// </summary>
/// <param name="value">Characters that make up the set.</param>
/// <returns>A <see cref="CharSetExpression"/> over those characters.</returns>
public static RegularExpression CharSetOf(string value)
{
    char[] members = value.ToCharArray();
    return new CharSetExpression(members);
}

/// <summary>
/// Creates a character set containing every character, among those produced by
/// <c>Iterate(char.MinValue, char.MaxValue)</c>, for which <paramref name="predicate"/> returns <c>true</c>.
/// </summary>
/// <param name="predicate">Test that decides whether a character belongs to the set.</param>
/// <returns>A <see cref="CharSetExpression"/> over the accepted characters.</returns>
public static RegularExpression CharSetOf(Func<char, bool> predicate)
{
    char[] members = Iterate(char.MinValue, char.MaxValue).Where(predicate).ToArray();
    return new CharSetExpression(members);
}
/// <summary>
/// Creates the character range <c>[a-b]</c>.
/// </summary>
/// <param name="a">First character of the range.</param>
/// <param name="b">Last character of the range.</param>
/// <returns>A <see cref="CharSetExpression"/> over the characters produced by <c>Iterate(a, b)</c>.</returns>
public static RegularExpression Range(char a, char b)
{
    char[] members = Iterate(a, b).ToArray();
    return new CharSetExpression(members);
}
/// <summary>
/// Enumerates every character from <paramref name="a"/> to <paramref name="b"/>, both
/// inclusive, in ascending order. Yields nothing when <paramref name="a"/> is greater than
/// <paramref name="b"/>.
/// </summary>
/// <param name="a">First character to yield.</param>
/// <param name="b">Last character to yield.</param>
private static IEnumerable<char> Iterate(char a, char b)
{
    // Count in int: a char counter would wrap from char.MaxValue back to 0 and never
    // terminate when b is char.MaxValue. This way the upper bound is still yielded.
    for (int code = a; code <= b; code++)
    {
        yield return (char)code;
    }
}
}
public class EmptyExpression : RegularExpression
@ -107,3 +166,59 @@ public class KleeneExpression(RegularExpression inner) : RegularExpression
return new NondeterministicFiniteAutomation(final, [final]);
}
}
/// <summary>
/// Expression built from a non-empty set of characters; its automaton has one transition
/// from the start state to the accepting state per character in the set.
/// </summary>
public class CharSetExpression : RegularExpression
{
    /// <summary>Characters that belong to the set.</summary>
    public char[] Set { get; }

    /// <summary>
    /// Creates the expression from a copy of <paramref name="set"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException"><paramref name="set"/> is empty.</exception>
    public CharSetExpression(Span<char> set)
    {
        if (set.IsEmpty)
        {
            throw new InvalidOperationException();
        }

        Set = set.ToArray();
    }

    public override NondeterministicFiniteAutomation Convert2Nfa()
    {
        NondeterministicState entry = new();
        NondeterministicState accept = new();

        for (int index = 0; index < Set.Length; index++)
        {
            entry.AddTransaction(new EmptyChar(Set[index]), accept);
        }

        return new NondeterministicFiniteAutomation(entry, [accept]);
    }
}
/// <summary>
/// Expression for a fixed, non-empty word; its automaton is a chain of states with one
/// transition per character of the word.
/// </summary>
public class StringExpression : RegularExpression
{
    /// <summary>The word this expression was created for.</summary>
    public string Word { get; }

    /// <summary>
    /// Creates the expression for <paramref name="word"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException"><paramref name="word"/> is null or empty.</exception>
    public StringExpression(string word)
    {
        if (word is null || word.Length == 0)
        {
            throw new InvalidOperationException();
        }

        Word = word;
    }

    public override NondeterministicFiniteAutomation Convert2Nfa()
    {
        NondeterministicState head = new();
        NondeterministicState tail = head;

        // Extend the chain by one state per character; the last state becomes the accepting one.
        foreach (char symbol in Word)
        {
            NondeterministicState successor = new();
            tail.AddTransaction(new EmptyChar(symbol), successor);
            tail = successor;
        }

        return new NondeterministicFiniteAutomation(head, [tail]);
    }
}

View File

@ -0,0 +1,78 @@
using CanonSharp.Common.Abstractions;
namespace CanonSharp.Common.Reader;
/// <summary>
/// <see cref="ISourceReader"/> that reads characters from a file through a buffer.
/// </summary>
/// <remarks>
/// The reader owns the underlying <see cref="StreamReader"/>; dispose it to release the file.
/// </remarks>
public class SourceReader : ISourceReader, IDisposable
{
    private readonly StreamReader _reader;
    private readonly char[] _buffer = new char[1024];

    // Number of valid characters currently held in _buffer.
    private int _length;

    // Index of the next unread character in _buffer.
    private int _count;

    // Character fetched by TryPeek that has not been consumed by Read yet.
    private char? _lookAhead;

    /// <summary>
    /// Opens <paramref name="filename"/> for reading.
    /// </summary>
    /// <param name="filename">Path of the source file.</param>
    /// <exception cref="InvalidOperationException">The file does not exist.</exception>
    public SourceReader(string filename)
    {
        FileInfo source = new(filename);

        if (!source.Exists)
        {
            throw new InvalidOperationException($"Source file '{filename}' does not exist.");
        }

        _reader = new StreamReader(filename);
    }

    /// <summary>
    /// Consumes and returns the next character.
    /// </summary>
    /// <exception cref="InvalidOperationException">There are no more characters to read.</exception>
    public char Read()
    {
        if (_lookAhead.HasValue)
        {
            char result = _lookAhead.Value;
            _lookAhead = null;
            return result;
        }

        if (!TryFetchChar(out char c))
        {
            throw new InvalidOperationException("Cannot read past the end of the source file.");
        }

        return c;
    }

    /// <summary>
    /// Looks at the next character without consuming it.
    /// </summary>
    /// <param name="c">The next character when one is available.</param>
    /// <returns><c>true</c> when a character is available; otherwise <c>false</c>.</returns>
    public bool TryPeek(out char c)
    {
        if (_lookAhead.HasValue)
        {
            c = _lookAhead.Value;
            return true;
        }

        if (!TryFetchChar(out c))
        {
            return false;
        }

        // Remember the fetched character so the next Read or TryPeek returns it again.
        _lookAhead = c;
        return true;
    }

    /// <summary>
    /// Closes the underlying file.
    /// </summary>
    public void Dispose()
    {
        _reader.Dispose();
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Takes the next character from the buffer, refilling the buffer from the file when
    /// every buffered character has been used.
    /// </summary>
    private bool TryFetchChar(out char c)
    {
        if (_length == _count)
        {
            _length = _reader.Read(_buffer);
            _count = 0;
        }

        // A refill that produced no characters means the end of the file was reached.
        if (_length == 0)
        {
            c = char.MinValue;
            return false;
        }

        c = _buffer[_count];
        _count += 1;
        return true;
    }
}

View File

@ -0,0 +1,33 @@
using CanonSharp.Common.Abstractions;
namespace CanonSharp.Common.Reader;
/// <summary>
/// <see cref="ISourceReader"/> over an in-memory string.
/// </summary>
/// <param name="source">Text to read from.</param>
public class StringReader(string source) : ISourceReader
{
    // Index of the next unread character in source.
    private int _pos;

    /// <summary>
    /// Consumes and returns the next character.
    /// </summary>
    /// <exception cref="InvalidOperationException">The whole string has already been read.</exception>
    public char Read()
    {
        if (!TryPeek(out char current))
        {
            throw new InvalidOperationException();
        }

        _pos++;
        return current;
    }

    /// <summary>
    /// Looks at the next character without consuming it.
    /// </summary>
    /// <param name="c">The next character, or <see cref="char.MinValue"/> when none is left.</param>
    /// <returns><c>true</c> when a character is available; otherwise <c>false</c>.</returns>
    public bool TryPeek(out char c)
    {
        bool available = _pos < source.Length;
        c = available ? source[_pos] : char.MinValue;
        return available;
    }
}

View File

@ -0,0 +1,25 @@
using CanonSharp.Common.Abstractions;
using StringReader = CanonSharp.Common.Reader.StringReader;
namespace CanonSharp.Tests.LexicalAnalyzerTests;
public class ReaderTests
{
    /// <summary>
    /// Peeking is repeatable and does not consume; reading consumes exactly one character.
    /// </summary>
    [Fact]
    public void StringReaderTest()
    {
        StringReader reader = new("ab");

        foreach (char expected in "ab")
        {
            // Two peeks in a row must report the same character.
            for (int attempt = 0; attempt < 2; attempt++)
            {
                Assert.True(reader.TryPeek(out char peeked));
                Assert.Equal(expected, peeked);
            }

            Assert.Equal(expected, reader.Read());
        }

        Assert.False(reader.TryPeek(out _));
    }
}

View File

@ -43,6 +43,15 @@ public class RegularExpressionTests
s => s.Transactions.ContainsKey(new EmptyChar('a')));
}
/// <summary>
/// The range a-z produces one transition per letter out of the start state.
/// </summary>
[Fact]
public void RangeTest()
{
    NondeterministicFiniteAutomation automaton = RegularExpression.Range('a', 'z').Convert2Nfa();

    int outgoing = automaton.Start.Transactions.Count;

    Assert.Equal(26, outgoing);
}
[Fact]
public void ConvertTest()
{
@ -74,4 +83,22 @@ public class RegularExpressionTests
Assert.Equal('a', map[new NondeterministicStateSet([key2, key1])]);
}
/// <summary>
/// Converting an alternation in which one word is a prefix of the other must not throw.
/// </summary>
[Fact]
public void PrefixConvertTest()
{
    RegularExpression shorter = RegularExpression.String("string");
    RegularExpression longer = RegularExpression.String("string1");

    NondeterministicFiniteAutomation automaton = RegularExpression.Alternate(shorter, longer).Convert2Nfa();

    DeterministicFiniteAutomation.Create(automaton);
}
/// <summary>
/// Converting the predefined white-space token to a deterministic automaton must not throw.
/// </summary>
[Fact]
public void WhiteSpaceConvertTest()
{
    RegularExpression whiteSpace = LexicalToken.WhiteSpace.Expression;

    DeterministicFiniteAutomation.Create(whiteSpace.Convert2Nfa());
}
}

View File

@ -0,0 +1,71 @@
using CanonSharp.Common.LexicalAnalyzer;
using StringReader = CanonSharp.Common.Reader.StringReader;
namespace CanonSharp.Tests.LexicalAnalyzerTests;
public class ScanTests
{
    /// <summary>
    /// A single literal token is recognised together with its text.
    /// </summary>
    [Fact]
    public void ScanTest1()
    {
        LexicalScannerBuilder builder = LexicalScanner.CreateEmptyBuilder();
        LexicalToken abToken = new(RegularExpression.String("ab"), 1);
        builder.DefineToken(abToken);

        LexicalScanner scanner = builder.Build(new StringReader("ab"));

        Assert.True(scanner.TryRead(out LexicalToken? scanned));
        Assert.Equal(abToken, scanned);
        Assert.Equal("ab", scanned.LiteralValue);
    }

    /// <summary>
    /// Keywords, identifiers and delimiters are recognised in order while white space and
    /// line breaks are skipped.
    /// </summary>
    [Fact]
    public void ScanTest2()
    {
        LexicalScannerBuilder builder = LexicalScanner.CreateDefaultBuilder();

        LexicalToken stringKeyword = new(RegularExpression.String("string"), 100);
        LexicalToken periodDelimiter = new(RegularExpression.Single('.'), 100);
        LexicalToken semiColonDelimiter = new(RegularExpression.Single(';'), 100);
        LexicalToken identifier = new(
            RegularExpression.Concatenate(
                RegularExpression.Range('a', 'z'),
                RegularExpression.Kleene(RegularExpression.Range('a', 'z'))),
            0);
        LexicalToken assigner = new(RegularExpression.String(":="), 100);

        LexicalToken[] definitions =
            [stringKeyword, periodDelimiter, semiColonDelimiter, identifier, assigner];
        foreach (LexicalToken definition in definitions)
        {
            builder.DefineToken(definition);
        }

        StringReader reader = new("""
            string value := origin;
            string abc := value.
            """);
        LexicalScanner scanner = builder.Build(reader);

        // Both statements share the shape: keyword, identifier, assigner, identifier.
        LexicalToken[] statement = [stringKeyword, identifier, assigner, identifier];
        Validate(scanner, [.. statement, semiColonDelimiter, .. statement, periodDelimiter]);

        Assert.False(scanner.TryRead(out _));
    }

    /// <summary>
    /// Asserts that the scanner produces exactly the expected tokens, in order.
    /// </summary>
    private static void Validate(LexicalScanner scanner, IEnumerable<LexicalToken> expectedTokens)
    {
        using IEnumerator<LexicalToken> expected = expectedTokens.GetEnumerator();

        while (expected.MoveNext())
        {
            Assert.True(scanner.TryRead(out LexicalToken? actual));
            Assert.NotNull(actual);
            Assert.Equal(expected.Current, actual);
        }
    }
}