diff --git a/.idea/rustic-sysy.iml b/.idea/rustic-sysy.iml
index cf84ae4..bbe0a70 100644
--- a/.idea/rustic-sysy.iml
+++ b/.idea/rustic-sysy.iml
@@ -3,6 +3,7 @@
+
diff --git a/src/lib.rs b/src/lib.rs
index a33fb60..5768976 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1 +1 @@
-mod tokenizer;
\ No newline at end of file
+pub mod tokenizer;
\ No newline at end of file
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 0000000..197b016
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,51 @@
+//! Tokenizer: turns source text into a flat list of lexical tokens.
+
+use crate::tokenizer::parser::{combine_parser, junk_parser};
+use nom::IResult;
+
+mod parser;
+
+/// Category of a lexical token; numeric literals carry their parsed value.
+#[derive(PartialEq, Debug, Copy, Clone)]
+pub enum LexicalTokenType {
+    Identifier,
+    ConstInteger(u32),
+    ConstFloat(f32),
+    Keyword,
+    Delimiter,
+    Operator,
+}
+
+/// A single token together with the exact source slice it was read from.
+#[derive(PartialEq, Debug, Copy, Clone)]
+pub struct LexicalToken<'a> {
+    pub token_type: LexicalTokenType,
+    pub literal_value: &'a str,
+}
+
+/// Splits `input` into tokens, skipping whitespace and comments.
+///
+/// On success the remaining input is empty and the vector holds the tokens in
+/// source order.
+///
+/// # Errors
+///
+/// Returns the underlying nom error when no token parser matches at the
+/// current position, e.g. for an integer literal that does not fit in `u32`.
+pub fn lexical_parser(mut input: &str) -> IResult<&str, Vec<LexicalToken>> {
+    let mut tokens = Vec::new();
+
+    while !input.is_empty() {
+        // Whitespace and comments produce no token; just advance past them.
+        if let Ok((rest, ())) = junk_parser(input) {
+            input = rest;
+            continue;
+        }
+
+        let (rest, token) = combine_parser(input)?;
+        input = rest;
+        tokens.push(token);
+    }
+
+    Ok((input, tokens))
+}
diff --git a/src/tokenizer/parser.rs b/src/tokenizer/parser.rs
new file mode 100644
index 0000000..eb0cbcf
--- /dev/null
+++ b/src/tokenizer/parser.rs
@@ -0,0 +1,413 @@
+use crate::tokenizer::{LexicalToken, LexicalTokenType};
+use nom::branch::alt;
+use nom::bytes::complete::{tag, take_till, take_until};
+use nom::character::complete::{
+    alpha1, alphanumeric1, digit0, digit1, hex_digit1, multispace1, oct_digit0, one_of,
+};
+use nom::combinator::{map, map_res, not, opt, recognize, value};
+use nom::multi::many0_count;
+use nom::sequence::{pair, preceded, tuple};
+use nom::IResult;
+use std::str::FromStr;
+
+/// Parses a reserved word.
+///
+/// The word must not be directly followed by an identifier character, so that
+/// inputs such as `int_a` are left for `identifier_parser`.
+fn keyword_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map(
+        recognize(pair(
+            alt((
+                tag("break"),
+                tag("const"),
+                tag("continue"),
+                tag("else"),
+                tag("float"),
+                tag("if"),
+                tag("int"),
+                tag("return"),
+                tag("void"),
+                tag("while"),
+            )),
+            // `not` never consumes input, so it already acts as a lookahead.
+            not(alt((alphanumeric1, tag("_")))),
+        )),
+        |literal| LexicalToken {
+            token_type: LexicalTokenType::Keyword,
+            literal_value: literal,
+        },
+    )(input)
+}
+
+/// Parses a single punctuation delimiter: `,` `;` `(` `)` `[` `]` `{` `}`.
+fn delimiter_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map(
+        alt((
+            tag(","),
+            tag(";"),
+            tag("("),
+            tag(")"),
+            tag("["),
+            tag("]"),
+            tag("{"),
+            tag("}"),
+        )),
+        |literal| LexicalToken {
+            token_type: LexicalTokenType::Delimiter,
+            literal_value: literal,
+        },
+    )(input)
+}
+
+/// Parses an arithmetic, comparison, logical or assignment operator.
+fn operator_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map(
+        alt((
+            // Two-character operators come first so that `>=` is not split into `>` and `=`.
+            tag(">="),
+            tag("<="),
+            tag("=="),
+            tag("!="),
+            tag("&&"),
+            tag("||"),
+            tag("="),
+            tag("+"),
+            tag("-"),
+            tag("!"),
+            tag("*"),
+            tag("/"),
+            tag("%"),
+            tag(">"),
+            tag("<"),
+        )),
+        |literal| LexicalToken {
+            token_type: LexicalTokenType::Operator,
+            literal_value: literal,
+        },
+    )(input)
+}
+
+/// Parses an identifier: a letter or `_` followed by letters, digits or `_`.
+fn identifier_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map(
+        recognize(pair(
+            alt((alpha1, tag("_"))),
+            many0_count(alt((alphanumeric1, tag("_")))),
+        )),
+        |literal| LexicalToken {
+            token_type: LexicalTokenType::Identifier,
+            literal_value: literal,
+        },
+    )(input)
+}
+
+/// Parses a decimal integer literal (no leading zero).
+///
+/// Fails instead of panicking when the value does not fit in `u32`.
+fn decimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map_res(recognize(pair(one_of("123456789"), digit0)), |literal| {
+        u32::from_str_radix(literal, 10).map(|number| LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(number),
+            literal_value: literal,
+        })
+    })(input)
+}
+
+/// Parses an octal integer literal: `0` followed by any number of octal digits.
+///
+/// Fails instead of panicking when the value does not fit in `u32`.
+fn octal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map_res(recognize(pair(tag("0"), oct_digit0)), |literal| {
+        u32::from_str_radix(literal, 8).map(|number| LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(number),
+            literal_value: literal,
+        })
+    })(input)
+}
+
+/// Parses a hexadecimal integer literal: `0x` or `0X` followed by at least one
+/// hexadecimal digit.
+///
+/// Fails instead of panicking on a bare prefix or when the value does not fit
+/// in `u32`.
+fn hexadecimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map_res(
+        recognize(pair(alt((tag("0x"), tag("0X"))), hex_digit1)),
+        |literal: &str| {
+            // The prefix is two ASCII bytes, so slicing at 2 is on a char boundary.
+            u32::from_str_radix(&literal[2..], 16).map(|number| LexicalToken {
+                token_type: LexicalTokenType::ConstInteger(number),
+                literal_value: literal,
+            })
+        },
+    )(input)
+}
+
+/// Parses an integer literal in any supported base.
+fn integer_parser(input: &str) -> IResult<&str, LexicalToken> {
+    alt((
+        hexadecimal_integer_parser,
+        // A literal starting with `0x`/`0X` must be valid hexadecimal; never
+        // fall back to reading just its leading `0` as an octal literal.
+        preceded(not(alt((tag("0x"), tag("0X")))), octal_integer_parser),
+        decimal_integer_parser,
+    ))(input)
+}
+
+/// Parses a float literal of the form `digits.digits`.
+fn float_parser(input: &str) -> IResult<&str, LexicalToken> {
+    map_res(
+        recognize(tuple((
+            alt((recognize(pair(one_of("123456789"), digit0)), tag("0"))),
+            tag("."),
+            digit1,
+        ))),
+        |literal| {
+            f32::from_str(literal).map(|number| LexicalToken {
+                token_type: LexicalTokenType::ConstFloat(number),
+                literal_value: literal,
+            })
+        },
+    )(input)
+}
+
+/// Parses a `// ...` line comment or a `/* ... */` block comment.
+fn comments_parser(input: &str) -> IResult<&str, ()> {
+    alt((
+        // A line comment ends at the newline or at the end of input, whichever comes first.
+        value(
+            (),
+            tuple((tag("//"), take_till(|c: char| c == '\n'), opt(tag("\n")))),
+        ),
+        value((), tuple((tag("/*"), take_until("*/"), tag("*/")))),
+    ))(input)
+}
+
+/// Skips one run of whitespace or one comment.
+pub fn junk_parser(input: &str) -> IResult<&str, ()> {
+    alt((value((), multispace1), comments_parser))(input)
+}
+
+/// Parses exactly one token of any kind.
+pub fn combine_parser(input: &str) -> IResult<&str, LexicalToken> {
+    alt((
+        // Order matters: floats before integers, keywords before identifiers.
+        float_parser,
+        integer_parser,
+        keyword_parser,
+        identifier_parser,
+        delimiter_parser,
+        operator_parser,
+    ))(input)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn assert_parse(expected: LexicalToken, actual: IResult<&str, LexicalToken>) {
+        let (_, token) = actual.unwrap();
+
+        assert_eq!(expected, token);
+    }
+
+    #[test]
+    fn keyword_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Keyword,
+            literal_value: "const",
+        }, keyword_parser("const int a = 3;"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Keyword,
+            literal_value: "return",
+        }, keyword_parser("return 0;"));
+    }
+
+    #[test]
+    fn delimiter_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Delimiter,
+            literal_value: "{",
+        }, delimiter_parser("{ int i = 3;}"));
+    }
+
+    #[test]
+    fn operator_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Operator,
+            literal_value: "!=",
+        }, operator_parser("!="));
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Operator,
+            literal_value: "!",
+        }, operator_parser("!"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Operator,
+            literal_value: ">=",
+        }, operator_parser(">="));
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Operator,
+            literal_value: ">",
+        }, operator_parser("> 123"));
+    }
+
+    #[test]
+    fn identifier_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Identifier,
+            literal_value: "a",
+        }, identifier_parser("a = 3"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Identifier,
+            literal_value: "_123",
+        }, identifier_parser("_123 = NULL"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Identifier,
+            literal_value: "test_123",
+        }, identifier_parser("test_123 += 3"));
+    }
+
+    #[test]
+    fn decimal_integer_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(100),
+            literal_value: "100",
+        }, decimal_integer_parser("100"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(56),
+            literal_value: "56",
+        }, decimal_integer_parser("56 + 44"));
+    }
+
+    #[test]
+    fn octal_integer_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(63),
+            literal_value: "077",
+        }, octal_integer_parser("077"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(0),
+            literal_value: "0",
+        }, octal_integer_parser("0"));
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(0),
+            literal_value: "00",
+        }, octal_integer_parser("00"));
+    }
+
+    #[test]
+    fn hexadecimal_integer_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(0),
+            literal_value: "0x0",
+        }, hexadecimal_integer_parser("0x0"));
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(0),
+            literal_value: "0X00",
+        }, hexadecimal_integer_parser("0X00"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(15),
+            literal_value: "0xF",
+        }, hexadecimal_integer_parser("0xF"));
+    }
+
+    #[test]
+    fn integer_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(0),
+            literal_value: "0",
+        }, integer_parser("0"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(45),
+            literal_value: "0055",
+        }, integer_parser("0055"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(291),
+            literal_value: "0X123",
+        }, integer_parser("0X123"));
+    }
+
+    #[test]
+    fn malformed_integer_test() {
+        // A bare prefix used to panic on the empty digit string.
+        assert!(hexadecimal_integer_parser("0x").is_err());
+        assert!(integer_parser("0x").is_err());
+        assert!(combine_parser("0X;").is_err());
+
+        // Values above `u32::MAX` used to panic in `unwrap`.
+        assert!(decimal_integer_parser("4294967296").is_err());
+        assert!(octal_integer_parser("077777777777").is_err());
+        assert!(integer_parser("0x100000000").is_err());
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(u32::MAX),
+            literal_value: "4294967295",
+        }, decimal_integer_parser("4294967295"));
+    }
+
+    #[test]
+    fn float_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstFloat(100.0),
+            literal_value: "100.0",
+        }, float_parser("100.0"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstFloat(0.5),
+            literal_value: "0.5",
+        }, float_parser("0.5"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstFloat(123.456),
+            literal_value: "123.456",
+        }, float_parser("123.456"));
+    }
+
+    #[test]
+    fn comments_parser_test() {
+        assert_eq!(comments_parser("// note\nint"), Ok(("int", ())));
+        assert_eq!(comments_parser("/* a\nb */ int"), Ok((" int", ())));
+
+        // A line comment on the last line has no terminating newline.
+        assert_eq!(comments_parser("// trailing"), Ok(("", ())));
+    }
+
+    #[test]
+    fn combine_parser_test() {
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstInteger(120),
+            literal_value: "120",
+        }, combine_parser("120"));
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::ConstFloat(120.11),
+            literal_value: "120.110",
+        }, combine_parser("120.110"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Keyword,
+            literal_value: "const",
+        }, combine_parser("const"));
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Identifier,
+            literal_value: "const_number",
+        }, combine_parser("const_number"));
+
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Keyword,
+            literal_value: "int"
+        }, combine_parser("int"));
+        assert_parse(LexicalToken {
+            token_type: LexicalTokenType::Identifier,
+            literal_value: "int_a"
+        }, combine_parser("int_a"));
+    }
+}
\ No newline at end of file
diff --git a/tests/lexical_tests.rs b/tests/lexical_tests.rs
new file mode 100644
index 0000000..2e3add0
--- /dev/null
+++ b/tests/lexical_tests.rs
@@ -0,0 +1,182 @@
+use rustic_sysy::tokenizer::{lexical_parser, LexicalTokenType};
+use rustic_sysy::tokenizer::LexicalTokenType::{ConstInteger, Delimiter, Identifier, Keyword, Operator};
+
+/// Tokenizes `input` and checks that the token types match `tokens`, in order.
+fn validate_tokens(input: &'static str, tokens: Vec<LexicalTokenType>) {
+    let (_, actual_tokens) = lexical_parser(input).unwrap();
+
+    assert_eq!(tokens.len(), actual_tokens.len());
+
+    for (actual, expected) in actual_tokens.iter().zip(tokens.iter()) {
+        assert_eq!(&actual.token_type, expected,
+                   "The literal value of actual token is {}", actual.literal_value);
+    }
+}
+
+#[test]
+fn main_test() {
+    validate_tokens("int main { return 0; }", vec![
+        Keyword,
+        Identifier,
+        Delimiter,
+        Keyword,
+        ConstInteger(0),
+        Delimiter,
+        Delimiter
+    ]);
+}
+
+#[test]
+fn hexadecimal_test() {
+    validate_tokens("// test hexadecimal define
+int main(){
+    int a;
+    a = 0xf;
+    return a;
+}", vec![
+        Keyword,
+        Identifier,
+        Delimiter,
+        Delimiter,
+        Delimiter,
+        Keyword,
+        Identifier,
+        Delimiter,
+        Identifier,
+        Operator,
+        ConstInteger(15),
+        Delimiter,
+        Keyword,
+        Identifier,
+        Delimiter,
+        Delimiter
+    ]);
+}
+
+#[test]
+fn trailing_line_comment_test() {
+    // The input ends inside a `//` comment, without a final newline.
+    validate_tokens("return 0; // done", vec![
+        Keyword,
+        ConstInteger(0),
+        Delimiter
+    ]);
+}
+
+#[test]
+fn while_and_if_test() {
+    validate_tokens("
+    // test while-if
+    int whileIf() {
+        int a;
+        a = 0;
+        int b;
+        b = 0;
+        while (a < 100) {
+            if (a == 5) {
+                b = 25;
+            }
+            else if (a == 10) {
+                b = 42;
+            }
+            else {
+                b = a * 2;
+            }
+            a = a + 1;
+        }
+        return (b);
+    }", vec![
+        // int whileIf() {
+        Keyword,
+        Identifier,
+        Delimiter,
+        Delimiter,
+        Delimiter,
+        // int a;
+        Keyword,
+        Identifier,
+        Delimiter,
+        // a = 0;
+        Identifier,
+        Operator,
+        ConstInteger(0),
+        Delimiter,
+        // int b;
+        Keyword,
+        Identifier,
+        Delimiter,
+        // b = 0;
+        Identifier,
+        Operator,
+        ConstInteger(0),
+        Delimiter,
+        // while (a < 100) {
+        Keyword,
+        Delimiter,
+        Identifier,
+        Operator,
+        ConstInteger(100),
+        Delimiter,
+        Delimiter,
+        // if (a == 5) {
+        Keyword,
+        Delimiter,
+        Identifier,
+        Operator,
+        ConstInteger(5),
+        Delimiter,
+        Delimiter,
+        // b = 25;
+        Identifier,
+        Operator,
+        ConstInteger(25),
+        Delimiter,
+        // }
+        Delimiter,
+        // else if (a == 10) {
+        Keyword,
+        Keyword,
+        Delimiter,
+        Identifier,
+        Operator,
+        ConstInteger(10),
+        Delimiter,
+        Delimiter,
+        // b = 42;
+        Identifier,
+        Operator,
+        ConstInteger(42),
+        Delimiter,
+        // }
+        Delimiter,
+        // else {
+        Keyword,
+        Delimiter,
+        // b = a * 2;
+        Identifier,
+        Operator,
+        Identifier,
+        Operator,
+        ConstInteger(2),
+        Delimiter,
+        // }
+        Delimiter,
+        // a = a + 1;
+        Identifier,
+        Operator,
+        Identifier,
+        Operator,
+        ConstInteger(1),
+        Delimiter,
+        // }
+        Delimiter,
+        // return (b);
+        Keyword,
+        Delimiter,
+        Identifier,
+        Delimiter,
+        Delimiter,
+        // }
+        Delimiter
+    ]);
+}