add: tests for zero-parser.
tests/tokenizer/mod.rs (new file, 133 lines)
@@ -0,0 +1,133 @@
use nom::IResult;
use std::cell::RefCell;
use std::rc::Rc;
use zero_parser::parser::{ParserContext, ParserResult};

mod nom_parsers;
pub mod zero_parsers;

/// The category of a lexical token.
#[derive(Debug, Copy, Clone)]
pub enum LexicalTokenType {
    Identifier,
    /// Integer constant.
    /// The constant's literal value is already computed during lexing.
    ConstInteger(u32),
    /// Floating-point constant.
    ConstFloat(f32),
    Keyword,
    Delimiter,
    Operator,
    LiteralString,
}

/// A lexical token over `&str`, produced by the nom-based lexer.
#[derive(PartialEq, Debug, Copy, Clone)]
pub struct LexicalToken<'a> {
    pub token_type: LexicalTokenType,
    pub literal_value: &'a str,
}

/// A lexical token over `&[char]`, produced by the zero-parser lexer.
#[derive(PartialEq, Debug, Copy, Clone)]
pub struct NewLexicalToken<'a> {
    pub token_type: LexicalTokenType,
    pub literal_value: &'a [char],
}

/// Implements equality for lexical token types.
/// Equality is overridden because integer and float constants should compare
/// equal even when they carry different values.
///
/// That is, the following assertions hold:
/// ```
/// use rustic_sysy::tokenizer::LexicalTokenType;
///
/// assert_eq!(LexicalTokenType::ConstInteger(0), LexicalTokenType::ConstInteger(2));
/// assert_eq!(LexicalTokenType::ConstFloat(0f32), LexicalTokenType::ConstFloat(2f32));
/// ```
impl PartialEq for LexicalTokenType {
    fn eq(&self, other: &Self) -> bool {
        // Equal exactly when both are the same enum variant; any payload
        // (the constant's value) is ignored.
        std::mem::discriminant(self) == std::mem::discriminant(other)
    }
}

impl<'a, 'b> PartialEq<NewLexicalToken<'a>> for LexicalToken<'b> {
    fn eq(&self, other: &NewLexicalToken) -> bool {
        self.token_type == other.token_type
            && self.literal_value.chars().collect::<String>()
                == other.literal_value.iter().collect::<String>()
    }
}

pub fn nom_lexical_parser(mut input: &str) -> IResult<&str, Vec<LexicalToken>> {
    let mut array = vec![];

    while !input.is_empty() {
        // Skip whitespace and comments; stop when only junk remains.
        if let Ok((i, _)) = nom_parsers::junk_parser(input) {
            if i.is_empty() {
                break;
            }

            input = i;
            continue;
        }

        let (i, token) = nom_parsers::combine_parser(input)?;
        input = i;
        array.push(token);
    }

    Ok((input, array))
}

pub fn zero_lexical_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    mut input: &[char],
) -> ParserResult<char, Vec<NewLexicalToken>> {
    let mut array = vec![];

    while !input.is_empty() {
        // Skip whitespace and comments; stop when only junk remains.
        if let Ok((i, _)) = zero_parsers::junk_parser(context.clone(), input) {
            if i.is_empty() {
                break;
            }

            input = i;
            continue;
        }

        let (i, token) = zero_parsers::combine_parser(context.clone(), input)?;
        input = i;
        array.push(token);
    }

    Ok((input, array))
}
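The cross-type `PartialEq<NewLexicalToken> for LexicalToken` above exists so the two lexers can be checked against each other token for token. A minimal sketch of such a test, assuming a hypothetical `ParserContext::new(())` constructor (the real zero_parser API may differ) and an illustrative sample program:

#[test]
fn lexers_agree() {
    // `ParserContext::new` is an assumed constructor, not shown in this commit.
    let source = "while (i <= 10) { i = i + 1; }";
    let chars: Vec<char> = source.chars().collect();
    let context = Rc::new(RefCell::new(ParserContext::new(())));

    let (_, old_tokens) = nom_lexical_parser(source).unwrap();
    let (_, new_tokens) = zero_lexical_parser(context, &chars).unwrap();

    // The cross-type PartialEq compares token types and literal values.
    assert_eq!(old_tokens.len(), new_tokens.len());
    for (old, new) in old_tokens.iter().zip(new_tokens.iter()) {
        assert_eq!(old, new);
    }
}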
tests/tokenizer/nom_parsers.rs (new file, 180 lines)
@@ -0,0 +1,180 @@
use crate::tokenizer::{LexicalToken, LexicalTokenType};
use nom::branch::alt;
use nom::bytes::complete::{tag, take_until};
use nom::character::complete::{
    alpha1, alphanumeric1, digit0, digit1, hex_digit1, multispace1, oct_digit0, one_of,
};
use nom::combinator::{map, not, peek, recognize, value};
use nom::multi::many0_count;
use nom::sequence::{delimited, pair, tuple};
use nom::IResult;
use std::str::FromStr;

fn keyword_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        recognize(tuple((
            alt((
                tag("break"),
                tag("const"),
                tag("continue"),
                tag("else"),
                tag("float"),
                tag("if"),
                tag("int"),
                tag("return"),
                tag("void"),
                tag("while"),
            )),
            // Reject keywords that are really the prefix of a longer
            // identifier, e.g. the "int" in "integer".
            not(alt((peek(alphanumeric1), tag("_")))),
        ))),
        |x| LexicalToken {
            token_type: LexicalTokenType::Keyword,
            literal_value: x,
        },
    )(input)
}

fn delimiter_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        alt((
            tag(","),
            tag(";"),
            tag("("),
            tag(")"),
            tag("["),
            tag("]"),
            tag("{"),
            tag("}"),
        )),
        |x| LexicalToken {
            token_type: LexicalTokenType::Delimiter,
            literal_value: x,
        },
    )(input)
}

fn operator_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        // Two-character operators must come before their one-character
        // prefixes, e.g. ">=" before ">".
        alt((
            tag(">="),
            tag("<="),
            tag("=="),
            tag("!="),
            tag("&&"),
            tag("||"),
            tag("="),
            tag("+"),
            tag("-"),
            tag("!"),
            tag("*"),
            tag("/"),
            tag("%"),
            tag(">"),
            tag("<"),
        )),
        |x| LexicalToken {
            token_type: LexicalTokenType::Operator,
            literal_value: x,
        },
    )(input)
}

fn identifier_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        recognize(pair(
            alt((alpha1, tag("_"))),
            many0_count(alt((alphanumeric1, tag("_")))),
        )),
        |s| LexicalToken {
            token_type: LexicalTokenType::Identifier,
            literal_value: s,
        },
    )(input)
}

fn decimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(recognize(pair(one_of("123456789"), digit0)), |x| {
        let number = u32::from_str_radix(x, 10).unwrap();

        LexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })(input)
}

fn octal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    // A leading '0' with no further digits is the literal zero, so
    // `oct_digit0` is deliberate here.
    map(recognize(pair(tag("0"), oct_digit0)), |x| {
        let number = u32::from_str_radix(x, 8).unwrap();

        LexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })(input)
}

fn hexadecimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        // `hex_digit1` (rather than `hex_digit0`) makes a bare "0x" fail
        // to parse instead of panicking in `from_str_radix` below.
        recognize(pair(alt((tag("0x"), tag("0X"))), hex_digit1)),
        |x: &str| {
            let number = u32::from_str_radix(&x[2..], 16).unwrap();

            LexicalToken {
                token_type: LexicalTokenType::ConstInteger(number),
                literal_value: x,
            }
        },
    )(input)
}

fn integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    // Hexadecimal first, so the leading '0' of "0x" is not consumed by
    // the octal parser.
    alt((
        hexadecimal_integer_parser,
        octal_integer_parser,
        decimal_integer_parser,
    ))(input)
}

fn float_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(recognize(tuple((digit1, tag("."), digit1))), |x| {
        let number = f32::from_str(x).unwrap();

        LexicalToken {
            token_type: LexicalTokenType::ConstFloat(number),
            literal_value: x,
        }
    })(input)
}

fn literal_string_parser(input: &str) -> IResult<&str, LexicalToken> {
    // The quotes delimit the token but are not part of its value.
    map(delimited(tag("\""), take_until("\""), tag("\"")), |s| {
        LexicalToken {
            token_type: LexicalTokenType::LiteralString,
            literal_value: s,
        }
    })(input)
}

fn comments_parser(input: &str) -> IResult<&str, ()> {
    alt((
        value((), tuple((tag("//"), take_until("\n"), tag("\n")))),
        value((), tuple((tag("/*"), take_until("*/"), tag("*/")))),
    ))(input)
}

pub fn junk_parser(input: &str) -> IResult<&str, ()> {
    alt((value((), multispace1), comments_parser))(input)
}

pub fn combine_parser(input: &str) -> IResult<&str, LexicalToken> {
    // Order matters: floats before integers so "1.5" is not split after
    // "1", and keywords before identifiers.
    alt((
        float_parser,
        integer_parser,
        literal_string_parser,
        keyword_parser,
        identifier_parser,
        delimiter_parser,
        operator_parser,
    ))(input)
}
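Two of the orderings in this file are easy to get wrong; the sketch below (sample inputs are illustrative, not from this commit) shows what they guarantee:

#[test]
fn ordering_and_keyword_boundary() {
    // float_parser runs before integer_parser in combine_parser, so
    // "1.5" lexes as one float token rather than the integer "1".
    let (_, token) = combine_parser("1.5").unwrap();
    assert_eq!(token.token_type, LexicalTokenType::ConstFloat(1.5));

    // The not(peek(...)) guard keeps "integer" from matching the keyword
    // "int"; it falls through to the identifier parser instead.
    assert!(keyword_parser("integer").is_err());
    let (_, token) = combine_parser("integer").unwrap();
    assert_eq!(token.token_type, LexicalTokenType::Identifier);
    assert_eq!(token.literal_value, "integer");
}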
tests/tokenizer/zero_parsers.rs (new file, 266 lines)
@@ -0,0 +1,266 @@
use crate::tokenizer::{LexicalTokenType, NewLexicalToken};
use nom::AsChar;
use std::cell::RefCell;
use std::rc::Rc;
use std::str::FromStr;
use zero_parser::combinators::{quote, take_till, tuple, ParserExt};
use zero_parser::parser::{any, Parser, ParserContext, ParserResult};
use zero_parser::text::{char_parser, one_of, string_parser};
use zero_parser::{alternate, parser::satisfy};

pub fn keyword_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        alternate!(
            string_parser("break"),
            string_parser("const"),
            string_parser("continue"),
            string_parser("else"),
            string_parser("float"),
            string_parser("if"),
            string_parser("int"),
            string_parser("return"),
            string_parser("void"),
            string_parser("while")
        ),
        // Negative lookahead, mirroring nom's `not(peek(...))`: fail when
        // the keyword is immediately followed by an identifier character.
        alternate!(satisfy(|c: &char| c.is_alphanumeric()), char_parser('_'))
            .look_ahead()
            .reverse(()),
    ))
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Keyword,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn delimiter_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    alternate!(
        char_parser(','),
        char_parser(';'),
        char_parser('('),
        char_parser(')'),
        char_parser('['),
        char_parser(']'),
        char_parser('{'),
        char_parser('}')
    )
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Delimiter,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn operator_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // Two-character operators must come before their one-character
    // prefixes, e.g. ">=" before ">".
    alternate!(
        string_parser(">="),
        string_parser("<="),
        string_parser("=="),
        string_parser("!="),
        string_parser("&&"),
        string_parser("||"),
        string_parser("="),
        string_parser("+"),
        string_parser("-"),
        string_parser("!"),
        string_parser("*"),
        string_parser("/"),
        string_parser("%"),
        string_parser(">"),
        string_parser("<")
    )
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Operator,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn identifier_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        alternate!(satisfy(|c: &char| c.is_alphabetic()), char_parser('_')),
        alternate!(satisfy(|c: &char| c.is_alphanumeric()), char_parser('_')).many(),
    ))
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Identifier,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn decimal_integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        one_of("123456789"),
        satisfy(|c: &char| c.is_ascii_digit()).many(),
    ))
    .literal()
    .map(|x| {
        let word: String = x.iter().collect();
        let number = u32::from_str_radix(word.as_str(), 10).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn octal_integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // A leading '0' with no further digits is the literal zero.
    tuple((
        char_parser('0'),
        satisfy(|c: &char| c.is_oct_digit()).many(),
    ))
    .literal()
    .map(|x| {
        let word: String = x.iter().collect();
        let number = u32::from_str_radix(word.as_str(), 8).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn hexadecimal_integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        alternate!(string_parser("0x"), string_parser("0X")),
        // `many1` makes a bare "0x" fail instead of panicking in
        // `from_str_radix` below.
        satisfy(|c: &char| c.is_hex_digit()).many1(),
    ))
    .literal()
    .map(|x| {
        // Skip the "0x"/"0X" prefix before converting.
        let word: String = x[2..].iter().collect();
        let number = u32::from_str_radix(word.as_str(), 16).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // Hexadecimal first, so the leading '0' of "0x" is not consumed by
    // the octal parser.
    alternate!(
        hexadecimal_integer_parser,
        octal_integer_parser,
        decimal_integer_parser
    )
    .parse(context, input)
}

pub fn float_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        satisfy(|c: &char| c.is_ascii_digit()).many1(),
        char_parser('.'),
        satisfy(|c: &char| c.is_ascii_digit()).many1(),
    ))
    .literal()
    .map(|x| {
        let word: String = x.iter().collect();
        let number = f32::from_str(word.as_str()).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstFloat(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn literal_string_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    quote(char_parser('"'), any(), char_parser('"'))
        .literal()
        .map(|x| {
            let length = x.len();
            NewLexicalToken {
                token_type: LexicalTokenType::LiteralString,
                // Strip both surrounding quotes so the value matches the
                // nom lexer, which captures only the string contents.
                literal_value: &x[1..length - 1],
            }
        })
        .parse(context, input)
}

pub fn comments_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, ()> {
    alternate!(
        tuple((
            string_parser("//"),
            take_till(char_parser('\n')),
            char_parser('\n')
        ))
        .map(|_| ()),
        tuple((
            string_parser("/*"),
            take_till(string_parser("*/")),
            string_parser("*/")
        ))
        .map(|_| ())
    )
    .parse(context, input)
}

pub fn junk_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, ()> {
    alternate!(
        comments_parser,
        satisfy(|c: &char| c.is_whitespace()).many1().map(|_| ())
    )
    .parse(context, input)
}

pub fn combine_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // Same ordering as the nom lexer: floats before integers, keywords
    // before identifiers.
    alternate!(
        float_parser,
        integer_parser,
        literal_string_parser,
        keyword_parser,
        identifier_parser,
        delimiter_parser,
        operator_parser
    )
    .parse(context, input)
}
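A usage sketch of the `&[char]` interface, again assuming the hypothetical `ParserContext::new(())` constructor (the exact constructor is not shown in this commit):

// `ParserContext::new` is an assumed constructor for illustration only.
let input: Vec<char> = "0x1F".chars().collect();
let context = Rc::new(RefCell::new(ParserContext::new(())));
let (rest, token) = hexadecimal_integer_parser(context, &input).unwrap();
assert!(rest.is_empty());
assert_eq!(token.token_type, LexicalTokenType::ConstInteger(0x1F));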