add: tests for zero-parser.
tests/tokenizer/mod.rs (new file, 133 lines)
@@ -0,0 +1,133 @@
use nom::IResult;
use std::cell::RefCell;
use std::rc::Rc;
use zero_parser::parser::{ParserContext, ParserResult};

mod nom_parsers;
pub mod zero_parsers;

/// The category of a lexical token.
#[derive(Debug, Copy, Clone)]
pub enum LexicalTokenType {
    Identifier,
    /// Integer constant.
    /// The constant's literal value is already computed during lexing.
    ConstInteger(u32),
    /// Floating-point constant.
    ConstFloat(f32),
    Keyword,
    Delimiter,
    Operator,
    LiteralString,
}

/// A lexical token over `&str`, produced by the nom-based lexer.
#[derive(PartialEq, Debug, Copy, Clone)]
pub struct LexicalToken<'a> {
    pub token_type: LexicalTokenType,
    pub literal_value: &'a str,
}

/// A lexical token over `&[char]`, produced by the zero-parser lexer.
#[derive(PartialEq, Debug, Copy, Clone)]
pub struct NewLexicalToken<'a> {
    pub token_type: LexicalTokenType,
    pub literal_value: &'a [char],
}

/// Implements equality for lexical token types.
/// Equality is overridden because integer and float constants should compare
/// equal even when they carry different values.
///
/// That is, the following assertions hold:
/// ```
/// use rustic_sysy::tokenizer::LexicalTokenType;
///
/// assert_eq!(LexicalTokenType::ConstInteger(0), LexicalTokenType::ConstInteger(2));
/// assert_eq!(LexicalTokenType::ConstFloat(0f32), LexicalTokenType::ConstFloat(2f32));
/// ```
impl PartialEq for LexicalTokenType {
    fn eq(&self, other: &Self) -> bool {
        // Equal exactly when both are the same enum variant; any payload
        // (the constant's value) is ignored.
        std::mem::discriminant(self) == std::mem::discriminant(other)
    }
}

impl<'a, 'b> PartialEq<NewLexicalToken<'a>> for LexicalToken<'b> {
    fn eq(&self, other: &NewLexicalToken) -> bool {
        self.token_type == other.token_type
            && self.literal_value.chars().collect::<String>()
                == other.literal_value.iter().collect::<String>()
    }
}

pub fn nom_lexical_parser(mut input: &str) -> IResult<&str, Vec<LexicalToken>> {
    let mut array = vec![];

    while !input.is_empty() {
        // Skip whitespace and comments; stop when only junk remains.
        if let Ok((i, _)) = nom_parsers::junk_parser(input) {
            if i.is_empty() {
                break;
            }

            input = i;
            continue;
        }

        let (i, token) = nom_parsers::combine_parser(input)?;
        input = i;
        array.push(token);
    }

    Ok((input, array))
}

pub fn zero_lexical_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    mut input: &[char],
) -> ParserResult<char, Vec<NewLexicalToken>> {
    let mut array = vec![];

    while !input.is_empty() {
        // Skip whitespace and comments; stop when only junk remains.
        if let Ok((i, _)) = zero_parsers::junk_parser(context.clone(), input) {
            if i.is_empty() {
                break;
            }

            input = i;
            continue;
        }

        let (i, token) = zero_parsers::combine_parser(context.clone(), input)?;
        input = i;
        array.push(token);
    }

    Ok((input, array))
}
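The cross-type `PartialEq<NewLexicalToken> for LexicalToken` above exists so the two lexers can be checked against each other token for token. A minimal sketch of such a test, assuming a hypothetical `ParserContext::new(())` constructor (the real zero_parser API may differ) and an illustrative sample program:

#[test]
fn lexers_agree() {
    // `ParserContext::new` is an assumed constructor, not shown in this commit.
    let source = "while (i <= 10) { i = i + 1; }";
    let chars: Vec<char> = source.chars().collect();
    let context = Rc::new(RefCell::new(ParserContext::new(())));

    let (_, old_tokens) = nom_lexical_parser(source).unwrap();
    let (_, new_tokens) = zero_lexical_parser(context, &chars).unwrap();

    // The cross-type PartialEq compares token types and literal values.
    assert_eq!(old_tokens.len(), new_tokens.len());
    for (old, new) in old_tokens.iter().zip(new_tokens.iter()) {
        assert_eq!(old, new);
    }
}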
tests/tokenizer/nom_parsers.rs (new file, 180 lines)
@@ -0,0 +1,180 @@
use crate::tokenizer::{LexicalToken, LexicalTokenType};
use nom::branch::alt;
use nom::bytes::complete::{tag, take_until};
use nom::character::complete::{
    alpha1, alphanumeric1, digit0, digit1, hex_digit1, multispace1, oct_digit0, one_of,
};
use nom::combinator::{map, not, peek, recognize, value};
use nom::multi::many0_count;
use nom::sequence::{delimited, pair, tuple};
use nom::IResult;
use std::str::FromStr;

fn keyword_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        recognize(tuple((
            alt((
                tag("break"),
                tag("const"),
                tag("continue"),
                tag("else"),
                tag("float"),
                tag("if"),
                tag("int"),
                tag("return"),
                tag("void"),
                tag("while"),
            )),
            // Reject keywords that are really the prefix of a longer
            // identifier, e.g. the "int" in "integer".
            not(alt((peek(alphanumeric1), tag("_")))),
        ))),
        |x| LexicalToken {
            token_type: LexicalTokenType::Keyword,
            literal_value: x,
        },
    )(input)
}

fn delimiter_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        alt((
            tag(","),
            tag(";"),
            tag("("),
            tag(")"),
            tag("["),
            tag("]"),
            tag("{"),
            tag("}"),
        )),
        |x| LexicalToken {
            token_type: LexicalTokenType::Delimiter,
            literal_value: x,
        },
    )(input)
}

fn operator_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        // Two-character operators must come before their one-character
        // prefixes, e.g. ">=" before ">".
        alt((
            tag(">="),
            tag("<="),
            tag("=="),
            tag("!="),
            tag("&&"),
            tag("||"),
            tag("="),
            tag("+"),
            tag("-"),
            tag("!"),
            tag("*"),
            tag("/"),
            tag("%"),
            tag(">"),
            tag("<"),
        )),
        |x| LexicalToken {
            token_type: LexicalTokenType::Operator,
            literal_value: x,
        },
    )(input)
}

fn identifier_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        recognize(pair(
            alt((alpha1, tag("_"))),
            many0_count(alt((alphanumeric1, tag("_")))),
        )),
        |s| LexicalToken {
            token_type: LexicalTokenType::Identifier,
            literal_value: s,
        },
    )(input)
}

fn decimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(recognize(pair(one_of("123456789"), digit0)), |x| {
        let number = u32::from_str_radix(x, 10).unwrap();

        LexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })(input)
}

fn octal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    // A leading '0' with no further digits is the literal zero, so
    // `oct_digit0` is deliberate here.
    map(recognize(pair(tag("0"), oct_digit0)), |x| {
        let number = u32::from_str_radix(x, 8).unwrap();

        LexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })(input)
}

fn hexadecimal_integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(
        // `hex_digit1` (rather than `hex_digit0`) makes a bare "0x" fail
        // to parse instead of panicking in `from_str_radix` below.
        recognize(pair(alt((tag("0x"), tag("0X"))), hex_digit1)),
        |x: &str| {
            let number = u32::from_str_radix(&x[2..], 16).unwrap();

            LexicalToken {
                token_type: LexicalTokenType::ConstInteger(number),
                literal_value: x,
            }
        },
    )(input)
}

fn integer_parser(input: &str) -> IResult<&str, LexicalToken> {
    // Hexadecimal first, so the leading '0' of "0x" is not consumed by
    // the octal parser.
    alt((
        hexadecimal_integer_parser,
        octal_integer_parser,
        decimal_integer_parser,
    ))(input)
}

fn float_parser(input: &str) -> IResult<&str, LexicalToken> {
    map(recognize(tuple((digit1, tag("."), digit1))), |x| {
        let number = f32::from_str(x).unwrap();

        LexicalToken {
            token_type: LexicalTokenType::ConstFloat(number),
            literal_value: x,
        }
    })(input)
}

fn literal_string_parser(input: &str) -> IResult<&str, LexicalToken> {
    // The quotes delimit the token but are not part of its value.
    map(delimited(tag("\""), take_until("\""), tag("\"")), |s| {
        LexicalToken {
            token_type: LexicalTokenType::LiteralString,
            literal_value: s,
        }
    })(input)
}

fn comments_parser(input: &str) -> IResult<&str, ()> {
    alt((
        value((), tuple((tag("//"), take_until("\n"), tag("\n")))),
        value((), tuple((tag("/*"), take_until("*/"), tag("*/")))),
    ))(input)
}

pub fn junk_parser(input: &str) -> IResult<&str, ()> {
    alt((value((), multispace1), comments_parser))(input)
}

pub fn combine_parser(input: &str) -> IResult<&str, LexicalToken> {
    // Order matters: floats before integers so "1.5" is not split after
    // "1", and keywords before identifiers.
    alt((
        float_parser,
        integer_parser,
        literal_string_parser,
        keyword_parser,
        identifier_parser,
        delimiter_parser,
        operator_parser,
    ))(input)
}
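Two of the orderings in this file are easy to get wrong; the sketch below (sample inputs are illustrative, not from this commit) shows what they guarantee:

#[test]
fn ordering_and_keyword_boundary() {
    // float_parser runs before integer_parser in combine_parser, so
    // "1.5" lexes as one float token rather than the integer "1".
    let (_, token) = combine_parser("1.5").unwrap();
    assert_eq!(token.token_type, LexicalTokenType::ConstFloat(1.5));

    // The not(peek(...)) guard keeps "integer" from matching the keyword
    // "int"; it falls through to the identifier parser instead.
    assert!(keyword_parser("integer").is_err());
    let (_, token) = combine_parser("integer").unwrap();
    assert_eq!(token.token_type, LexicalTokenType::Identifier);
    assert_eq!(token.literal_value, "integer");
}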
tests/tokenizer/zero_parsers.rs (new file, 266 lines)
@@ -0,0 +1,266 @@
use crate::tokenizer::{LexicalTokenType, NewLexicalToken};
use nom::AsChar;
use std::cell::RefCell;
use std::rc::Rc;
use std::str::FromStr;
use zero_parser::combinators::{quote, take_till, tuple, ParserExt};
use zero_parser::parser::{any, Parser, ParserContext, ParserResult};
use zero_parser::text::{char_parser, one_of, string_parser};
use zero_parser::{alternate, parser::satisfy};

pub fn keyword_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        alternate!(
            string_parser("break"),
            string_parser("const"),
            string_parser("continue"),
            string_parser("else"),
            string_parser("float"),
            string_parser("if"),
            string_parser("int"),
            string_parser("return"),
            string_parser("void"),
            string_parser("while")
        ),
        // Negative lookahead, mirroring nom's `not(peek(...))`: fail when
        // the keyword is immediately followed by an identifier character.
        alternate!(satisfy(|c: &char| c.is_alphanumeric()), char_parser('_'))
            .look_ahead()
            .reverse(()),
    ))
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Keyword,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn delimiter_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    alternate!(
        char_parser(','),
        char_parser(';'),
        char_parser('('),
        char_parser(')'),
        char_parser('['),
        char_parser(']'),
        char_parser('{'),
        char_parser('}')
    )
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Delimiter,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn operator_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // Two-character operators must come before their one-character
    // prefixes, e.g. ">=" before ">".
    alternate!(
        string_parser(">="),
        string_parser("<="),
        string_parser("=="),
        string_parser("!="),
        string_parser("&&"),
        string_parser("||"),
        string_parser("="),
        string_parser("+"),
        string_parser("-"),
        string_parser("!"),
        string_parser("*"),
        string_parser("/"),
        string_parser("%"),
        string_parser(">"),
        string_parser("<")
    )
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Operator,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn identifier_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        alternate!(satisfy(|c: &char| c.is_alphabetic()), char_parser('_')),
        alternate!(satisfy(|c: &char| c.is_alphanumeric()), char_parser('_')).many(),
    ))
    .literal()
    .map(|x| NewLexicalToken {
        token_type: LexicalTokenType::Identifier,
        literal_value: x,
    })
    .parse(context, input)
}

pub fn decimal_integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        one_of("123456789"),
        satisfy(|c: &char| c.is_ascii_digit()).many(),
    ))
    .literal()
    .map(|x| {
        let word: String = x.iter().collect();
        let number = u32::from_str_radix(word.as_str(), 10).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn octal_integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // A leading '0' with no further digits is the literal zero.
    tuple((
        char_parser('0'),
        satisfy(|c: &char| c.is_oct_digit()).many(),
    ))
    .literal()
    .map(|x| {
        let word: String = x.iter().collect();
        let number = u32::from_str_radix(word.as_str(), 8).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn hexadecimal_integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        alternate!(string_parser("0x"), string_parser("0X")),
        // `many1` makes a bare "0x" fail instead of panicking in
        // `from_str_radix` below.
        satisfy(|c: &char| c.is_hex_digit()).many1(),
    ))
    .literal()
    .map(|x| {
        // Skip the "0x"/"0X" prefix before converting.
        let word: String = x[2..].iter().collect();
        let number = u32::from_str_radix(word.as_str(), 16).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstInteger(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn integer_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // Hexadecimal first, so the leading '0' of "0x" is not consumed by
    // the octal parser.
    alternate!(
        hexadecimal_integer_parser,
        octal_integer_parser,
        decimal_integer_parser
    )
    .parse(context, input)
}

pub fn float_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    tuple((
        satisfy(|c: &char| c.is_ascii_digit()).many1(),
        char_parser('.'),
        satisfy(|c: &char| c.is_ascii_digit()).many1(),
    ))
    .literal()
    .map(|x| {
        let word: String = x.iter().collect();
        let number = f32::from_str(word.as_str()).unwrap();

        NewLexicalToken {
            token_type: LexicalTokenType::ConstFloat(number),
            literal_value: x,
        }
    })
    .parse(context, input)
}

pub fn literal_string_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    quote(char_parser('"'), any(), char_parser('"'))
        .literal()
        .map(|x| {
            let length = x.len();
            NewLexicalToken {
                token_type: LexicalTokenType::LiteralString,
                // Strip both surrounding quotes so the value matches the
                // nom lexer, which captures only the string contents.
                literal_value: &x[1..length - 1],
            }
        })
        .parse(context, input)
}

pub fn comments_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, ()> {
    alternate!(
        tuple((
            string_parser("//"),
            take_till(char_parser('\n')),
            char_parser('\n')
        ))
        .map(|_| ()),
        tuple((
            string_parser("/*"),
            take_till(string_parser("*/")),
            string_parser("*/")
        ))
        .map(|_| ())
    )
    .parse(context, input)
}

pub fn junk_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, ()> {
    alternate!(
        comments_parser,
        satisfy(|c: &char| c.is_whitespace()).many1().map(|_| ())
    )
    .parse(context, input)
}

pub fn combine_parser(
    context: Rc<RefCell<ParserContext<char, ()>>>,
    input: &[char],
) -> ParserResult<char, NewLexicalToken> {
    // Same ordering as the nom lexer: floats before integers, keywords
    // before identifiers.
    alternate!(
        float_parser,
        integer_parser,
        literal_string_parser,
        keyword_parser,
        identifier_parser,
        delimiter_parser,
        operator_parser
    )
    .parse(context, input)
}
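A usage sketch of the `&[char]` interface, again assuming the hypothetical `ParserContext::new(())` constructor (the exact constructor is not shown in this commit):

// `ParserContext::new` is an assumed constructor for illustration only.
let input: Vec<char> = "0x1F".chars().collect();
let context = Rc::new(RefCell::new(ParserContext::new(())));
let (rest, token) = hexadecimal_integer_parser(context, &input).unwrap();
assert!(rest.is_empty());
assert_eq!(token.token_type, LexicalTokenType::ConstInteger(0x1F));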