add: lexical parser.

This commit is contained in:
jackfiled 2024-08-27 18:08:18 +08:00
parent 33570a9a25
commit 852dbc824c
5 changed files with 591 additions and 1 deletions

View File

@ -3,6 +3,7 @@
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />

View File

@ -1 +1 @@
mod tokenizer;
pub mod tokenizer;

43
src/tokenizer.rs Normal file
View File

@ -0,0 +1,43 @@
use nom::IResult;
use crate::tokenizer::parser::{combine_parser, junk_parser};
mod parser;
/// The category of a lexical token; numeric categories carry the decoded value.
#[derive(PartialEq, Debug, Copy, Clone)]
pub enum LexicalTokenType {
    /// An identifier such as `main` or `_tmp`; the spelling is kept in
    /// [`LexicalToken::literal_value`].
    Identifier,
    /// An integer constant with its decoded value (decimal, octal or hexadecimal).
    ConstInteger(u32),
    /// A floating-point constant with its decoded value.
    ConstFloat(f32),
    /// A reserved word (`int`, `while`, ...); the spelling is in the token literal.
    Keyword,
    /// A punctuation delimiter: `,` `;` `(` `)` `[` `]` `{` `}`.
    Delimiter,
    /// An arithmetic, logical, relational or assignment operator.
    Operator,
}
/// One token produced by the lexer.
///
/// The token borrows its literal text from the source string, hence the
/// lifetime parameter `'a`.
#[derive(PartialEq, Debug, Copy, Clone)]
pub struct LexicalToken<'a> {
    /// Token category (with the decoded value for numeric constants).
    pub token_type: LexicalTokenType,
    /// The exact slice of source text this token was recognized from.
    pub literal_value: &'a str,
}
/// Splits the whole `input` into a vector of [`LexicalToken`]s.
///
/// Runs of whitespace and comments ("junk") are skipped between tokens;
/// anything left over must match one of the token parsers, otherwise the
/// error produced by `combine_parser` is propagated to the caller.
pub fn lexical_parser(mut input: &str) -> IResult<&str, Vec<LexicalToken>> {
    let mut tokens = Vec::new();

    while !input.is_empty() {
        // Skip one run of whitespace / one comment, then re-check the loop condition.
        match junk_parser(input) {
            Ok((remaining, ())) => {
                input = remaining;
                continue;
            }
            Err(_) => {}
        }

        if input.is_empty() {
            break;
        }

        // Lex exactly one token; a failure here aborts the whole parse via `?`.
        let (remaining, token) = combine_parser(input)?;
        input = remaining;
        tokens.push(token);
    }

    Ok((input, tokens))
}

375
src/tokenizer/parser.rs Normal file
View File

@ -0,0 +1,375 @@
use crate::tokenizer::{LexicalToken, LexicalTokenType};
use nom::branch::alt;
use nom::bytes::complete::{tag, take_until};
use nom::character::complete::{
    alpha1, alphanumeric1, digit0, digit1, hex_digit0, hex_digit1, multispace1, oct_digit0, one_of,
};
use nom::combinator::{map, not, peek, recognize, rest, value};
use nom::multi::many0_count;
use nom::sequence::{pair, tuple};
use nom::IResult;
use std::str::FromStr;
fn keyword_parser(input: &str) -> IResult<&str, LexicalToken> {
map(
recognize(tuple((
alt((
tag("break"),
tag("const"),
tag("continue"),
tag("else"),
tag("float"),
tag("if"),
tag("int"),
tag("return"),
tag("void"),
tag("while")
)),
not(
alt((peek(alphanumeric1), tag("_")))
)
))),
|x| LexicalToken {
token_type: LexicalTokenType::Keyword,
literal_value: x,
},
)(input)
}
/// Parses a single-character delimiter: `,` `;` `(` `)` `[` `]` `{` `}`.
fn delimiter_parser(input: &str) -> IResult<&str, LexicalToken> {
    let delimiter = alt((
        tag(","), tag(";"),
        tag("("), tag(")"),
        tag("["), tag("]"),
        tag("{"), tag("}"),
    ));

    map(delimiter, |literal| LexicalToken {
        token_type: LexicalTokenType::Delimiter,
        literal_value: literal,
    })(input)
}
/// Parses an operator.
///
/// Two-character operators are listed before their one-character
/// prefixes (`>=` before `>`, `==` before `=`, ...) because `alt` is
/// first-match-wins — reordering this list would split compound operators.
fn operator_parser(input: &str) -> IResult<&str, LexicalToken> {
    let operator = alt((
        // Two-character operators first.
        tag(">="), tag("<="), tag("=="), tag("!="), tag("&&"), tag("||"),
        // Then the single-character ones.
        tag("="), tag("+"), tag("-"), tag("!"), tag("*"), tag("/"), tag("%"), tag(">"), tag("<"),
    ));

    map(operator, |literal| LexicalToken {
        token_type: LexicalTokenType::Operator,
        literal_value: literal,
    })(input)
}
/// Parses an identifier: a letter or `_`, followed by any number of
/// letters, digits or underscores.
fn identifier_parser(input: &str) -> IResult<&str, LexicalToken> {
    let head = alt((alpha1, tag("_")));
    let tail = many0_count(alt((alphanumeric1, tag("_"))));

    map(recognize(pair(head, tail)), |literal| LexicalToken {
        token_type: LexicalTokenType::Identifier,
        literal_value: literal,
    })(input)
}
/// Parses a decimal integer literal: a non-zero leading digit followed
/// by any further digits (a bare `0` is handled by the octal parser).
///
/// NOTE(review): the `unwrap` panics on literals above `u32::MAX`;
/// consider reporting that as a lexer error instead — confirm intent.
fn decimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    let literal = recognize(pair(one_of("123456789"), digit0));

    map(literal, |digits: &str| {
        let value = digits.parse::<u32>().unwrap();
        LexicalToken {
            token_type: LexicalTokenType::ConstInteger(value),
            literal_value: digits,
        }
    })(input)
}
/// Parses an octal integer literal: a leading `0` followed by octal
/// digits. A bare `0` (and `00`, `000`, ...) parses here with value 0.
fn octal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    let literal = recognize(pair(tag("0"), oct_digit0));

    map(literal, |digits| {
        // The leading zero contributes nothing, so the whole slice can be
        // converted directly in base 8.
        let value = u32::from_str_radix(digits, 8).unwrap();
        LexicalToken {
            token_type: LexicalTokenType::ConstInteger(value),
            literal_value: digits,
        }
    })(input)
}
fn hexadecimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
map(recognize(pair(
alt((tag("0x"), tag("0X"))),
hex_digit0,
)), |x: &str| {
let number = u32::from_str_radix(&x[2..], 16).unwrap();
LexicalToken {
token_type: LexicalTokenType::ConstInteger(number),
literal_value: x,
}
})(input)
}
/// Parses any integer literal.
///
/// Branch order is significant: hexadecimal must come first (otherwise
/// the `0` of `0x1F` would be consumed by the octal branch), then octal
/// (leading `0`), and finally plain decimal.
fn integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    alt((
        hexadecimal_integer_parser,
        octal_integer_parser,
        decimal_integer_parser,
    ))(input)
}
/// Parses a decimal floating literal of the form `<integer>.<digits>`,
/// where the integer part is either `0` or starts with a non-zero digit.
/// Exponent and hexadecimal float forms are not handled here.
fn float_parser(input: &str) -> IResult<&str, LexicalToken> {
    let integer_part = alt((recognize(pair(one_of("123456789"), digit0)), tag("0")));
    let literal = recognize(tuple((integer_part, tag("."), digit1)));

    map(literal, |text: &str| {
        let value = f32::from_str(text).unwrap();
        LexicalToken {
            token_type: LexicalTokenType::ConstFloat(value),
            literal_value: text,
        }
    })(input)
}
fn comments_parser(input: &str) -> IResult<&str, ()> {
alt((
value((), tuple((
tag("//"),
take_until("\n"),
tag("\n")
))),
value((), tuple((
tag("/*"),
take_until("*/"),
tag("*/")
)
))
))(input)
}
/// Consumes one run of "junk" between tokens: either a run of
/// whitespace or a single comment.
pub fn junk_parser(input: &str) -> IResult<&str, ()> {
    alt((value((), multispace1), comments_parser))(input)
}
/// Parses the next lexical token from `input`.
///
/// Branch order is significant:
/// * floats before integers, so `1.5` is not lexed as the integer `1`;
/// * keywords before identifiers, so `int` is a keyword, while `int_a`
///   (rejected by the keyword parser's boundary check) falls through to
///   the identifier parser.
pub fn combine_parser(input: &str) -> IResult<&str, LexicalToken> {
    alt((
        float_parser,
        integer_parser,
        keyword_parser,
        identifier_parser,
        delimiter_parser,
        operator_parser,
    ))(input)
}
// Unit tests for the individual sub-parsers.
#[cfg(test)]
mod tests {
    use super::*;

    // Unwraps `actual` and compares its token against `except`
    // (note: "except" is a typo for "expected"); any unconsumed
    // remainder of the input is deliberately ignored.
    fn assert_parse(except: LexicalToken, actual: IResult<&str, LexicalToken>) {
        let (_, token) = actual.unwrap();
        assert_eq!(except, token);
    }

    #[test]
    fn keyword_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Keyword,
            literal_value: "const",
        }, keyword_parser("const int a = 3;"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Keyword,
            literal_value: "return",
        }, keyword_parser("return 0;"));
    }

    #[test]
    fn delimiter_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Delimiter,
            literal_value: "{",
        }, delimiter_parser("{ int i = 3;}"));
    }

    #[test]
    fn operator_parser_test() {
        // Compound operators must win over their single-character prefixes.
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Operator,
            literal_value: "!=",
        }, operator_parser("!="));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Operator,
            literal_value: "!",
        }, operator_parser("!"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Operator,
            literal_value: ">=",
        }, operator_parser(">="));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Operator,
            literal_value: ">",
        }, operator_parser("> 123"));
    }

    #[test]
    fn identifier_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Identifier,
            literal_value: "a",
        }, identifier_parser("a = 3"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Identifier,
            literal_value: "_123",
        }, identifier_parser("_123 = NULL"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Identifier,
            literal_value: "test_123",
        }, identifier_parser("test_123 += 3"));
    }

    #[test]
    fn decimal_integer_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(100),
            literal_value: "100",
        }, decimal_integer_parser("100"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(56),
            literal_value: "56",
        }, decimal_integer_parser("56 + 44"));
    }

    #[test]
    fn octal_integer_parser_test() {
        // 077 octal == 63 decimal.
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(63),
            literal_value: "077",
        }, octal_integer_parser("077"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(0),
            literal_value: "0",
        }, octal_integer_parser("0"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(0),
            literal_value: "00",
        }, octal_integer_parser("00"));
    }

    #[test]
    fn hexadecimal_integer_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(0),
            literal_value: "0x0",
        }, hexadecimal_integer_parser("0x0"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(0),
            literal_value: "0X00",
        }, hexadecimal_integer_parser("0X00"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(15),
            literal_value: "0xF",
        }, hexadecimal_integer_parser("0xF"));
    }

    #[test]
    fn integer_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(0),
            literal_value: "0",
        }, integer_parser("0"));
        // "0055" goes through the octal branch: 055 octal == 45 decimal.
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(45),
            literal_value: "0055",
        }, integer_parser("0055"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(291),
            literal_value: "0X123",
        }, integer_parser("0X123"));
    }

    #[test]
    fn float_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstFloat(100.0),
            literal_value: "100.0",
        }, float_parser("100.0"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstFloat(0.5),
            literal_value: "0.5",
        }, float_parser("0.5"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstFloat(123.456),
            literal_value: "123.456",
        }, float_parser("123.456"));
    }

    #[test]
    fn combine_parser_test() {
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstInteger(120),
            literal_value: "120",
        }, combine_parser("120"));
        // Float wins over integer; the literal keeps its trailing zero.
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::ConstFloat(120.11),
            literal_value: "120.110",
        }, combine_parser("120.110"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Keyword,
            literal_value: "const",
        }, combine_parser("const"));
        // A keyword prefix followed by `_` is an identifier, not a keyword.
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Identifier,
            literal_value: "const_number",
        }, combine_parser("const_number"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Keyword,
            literal_value: "int"
        }, combine_parser("int"));
        assert_parse(LexicalToken {
            token_type: LexicalTokenType::Identifier,
            literal_value: "int_a"
        }, combine_parser("int_a"));
    }
}

171
tests/lexical_tests.rs Normal file
View File

@ -0,0 +1,171 @@
use rustic_sysy::tokenizer::{lexical_parser, LexicalTokenType};
use rustic_sysy::tokenizer::LexicalTokenType::{ConstInteger, Delimiter, Identifier, Keyword, Operator};
/// Lexes `input` and asserts that the produced token-type sequence
/// matches `tokens` exactly, element by element.
fn validate_tokens(input: &'static str, tokens: Vec<LexicalTokenType>) {
    let (_, actual_tokens) = lexical_parser(input).unwrap();
    assert_eq!(tokens.len(), actual_tokens.len());

    // Lengths are equal (asserted above), so indexed access is safe.
    for (index, expected) in tokens.iter().enumerate() {
        let actual = &actual_tokens[index];
        assert_eq!(&actual.token_type, expected,
            "The literal value of actual token is {}", actual.literal_value);
    }
}
// Smoke test: a minimal (deliberately malformed — no parentheses)
// program produces the expected token-kind sequence.
#[test]
fn main_test() {
    validate_tokens("int main { return 0; }", vec![
        Keyword,
        Identifier,
        Delimiter,
        Keyword,
        ConstInteger(0),
        Delimiter,
        Delimiter
    ]);
}
// A hexadecimal constant (0xf) must decode to 15, and the leading
// line comment must be skipped as junk.
#[test]
fn hexadecimal_test() {
    validate_tokens("// test hexadecimal define
int main(){
int a;
a = 0xf;
return a;
}", vec![
        Keyword,
        Identifier,
        Delimiter,
        Delimiter,
        Delimiter,
        Keyword,
        Identifier,
        Delimiter,
        Identifier,
        Operator,
        ConstInteger(15),
        Delimiter,
        Keyword,
        Identifier,
        Delimiter,
        Delimiter
    ]);
}
// End-to-end lexing of a program with while / if / else-if chains;
// the expected token stream is annotated line-by-line below.
#[test]
fn while_and_if_test() {
    validate_tokens("
// test while-if
int whileIf() {
int a;
a = 0;
int b;
b = 0;
while (a < 100) {
if (a == 5) {
b = 25;
}
else if (a == 10) {
b = 42;
}
else {
b = a * 2;
}
a = a + 1;
}
return (b);
}", vec![
        // int whileIf() {
        Keyword,
        Identifier,
        Delimiter,
        Delimiter,
        Delimiter,
        // int a;
        Keyword,
        Identifier,
        Delimiter,
        // a = 0;
        Identifier,
        Operator,
        ConstInteger(0),
        Delimiter,
        // int b;
        Keyword,
        Identifier,
        Delimiter,
        // b = 0;
        Identifier,
        Operator,
        ConstInteger(0),
        Delimiter,
        // while (a < 100) {
        Keyword,
        Delimiter,
        Identifier,
        Operator,
        ConstInteger(100),
        Delimiter,
        Delimiter,
        // if (a == 5) {
        Keyword,
        Delimiter,
        Identifier,
        Operator,
        ConstInteger(5),
        Delimiter,
        Delimiter,
        // b = 25;
        Identifier,
        Operator,
        ConstInteger(25),
        Delimiter,
        // }
        Delimiter,
        // else if (a == 10) {
        Keyword,
        Keyword,
        Delimiter,
        Identifier,
        Operator,
        ConstInteger(10),
        Delimiter,
        Delimiter,
        // b = 42;
        Identifier,
        Operator,
        ConstInteger(42),
        Delimiter,
        // }
        Delimiter,
        // else {
        Keyword,
        Delimiter,
        // b = a * 2;
        Identifier,
        Operator,
        Identifier,
        Operator,
        ConstInteger(2),
        Delimiter,
        // }
        Delimiter,
        // a = a + 1;
        Identifier,
        Operator,
        Identifier,
        Operator,
        ConstInteger(1),
        Delimiter,
        // }
        Delimiter,
        // return (b);
        Keyword,
        Delimiter,
        Identifier,
        Delimiter,
        Delimiter,
        // }
        Delimiter
    ]);
}