// RubbishBin/LexicalParser/LexicalParser.cpp

#include <cstdio>
#include <list>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <string>

// Line-oriented lexical analyser for a C-like language.
//
// The parser reads its input one line at a time from a caller-supplied FILE*,
// prints every token it recognises to stdout as "<line> <KIND,text>" and keeps
// a running total per token kind in the public counters below.  Comments,
// blanks and tabs are skipped; malformed literals and the character '@' are
// reported as ERROR tokens.
class LexicalParser
{
public:
    int LineCount = 0;        // lines read so far; also the line number printed with each token
    int KeywordCount = 0;     // KEYWORD tokens
    int IdentifierCount = 0;  // IDENTIFIER tokens
    int OperatorCount = 0;    // OPERATOR tokens
    int DelimiterCount = 0;   // DELIMITER tokens
    int CharCount = 0;        // CHARCON (character constant) tokens
    int StringCount = 0;      // STRING tokens
    int NumberCount = 0;      // NUMBER tokens
    int ErrorCount = 0;       // ERROR tokens

    // Stores the stream to read from.  The stream is only read, never closed,
    // by this class.
    explicit LexicalParser(FILE *file) : File(file)
    {
    }

    // Tokenises the whole input.
    //
    // Returns when the input is exhausted, or early (without any report) at
    // the first character that none of the token rules accepts.
    void Loop()
    {
        // True while we are between "/*" and the matching "*/", possibly
        // across several lines.
        bool inBlockComment = false;

        while (Readline())
        {
            while (not Buffer.empty())
            {
                if (inBlockComment)
                {
                    // Search for the terminator as an adjacent character pair
                    // so that sequences such as "**/" are recognised.
                    const std::size_t close = Buffer.find("*/");
                    if (close == std::string::npos)
                    {
                        // No terminator on this line: the whole line is comment.
                        Buffer.clear();
                    }
                    else
                    {
                        inBlockComment = false;
                        Buffer.erase(0, close + 2);
                    }
                    continue;
                }

                // Single-line comment: discard the rest of the line.
                if (Buffer.compare(0, 2, "//") == 0)
                {
                    Buffer.clear();
                    continue;
                }

                // Start of a block comment: drop the opener so that "/*/" is
                // not mistaken for a complete comment.
                if (Buffer.compare(0, 2, "/*") == 0)
                {
                    inBlockComment = true;
                    Buffer.erase(0, 2);
                    continue;
                }

                const char head = Buffer.front();
                if (head == ' ' or head == '\t')
                {
                    Buffer.erase(0, 1);
                    continue;
                }

                // '@' is reported as a one-character error and skipped.
                if (head == '@')
                {
                    Emit("ERROR", 1, ErrorCount);
                    continue;
                }

                if (not Parse())
                {
                    return;
                }
            }
        }
    }


private:
    std::string Buffer;  // unconsumed remainder of the current line (without the '\n')
    FILE *File;          // input stream supplied by the caller

    // ---- small character helpers -------------------------------------------------

    static bool IsDigit(char c)
    {
        return c >= '0' and c <= '9';
    }

    static bool IsHexDigit(char c)
    {
        return IsDigit(c) or (c >= 'A' and c <= 'F') or (c >= 'a' and c <= 'f');
    }

    static bool IsIdentifierStart(char c)
    {
        return c == '_' or (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z');
    }

    static bool IsIdentifierChar(char c)
    {
        return IsIdentifierStart(c) or IsDigit(c);
    }

    static bool IsNumberSuffix(char c)
    {
        return c == 'u' or c == 'U' or c == 'l' or c == 'L' or c == 'f' or c == 'F';
    }

    // Character at `index`, or '\0' when `index` is past the end of Buffer.
    // Lets look-ahead code compare against concrete characters without a
    // separate bounds check.
    char At(std::size_t index) const
    {
        return index < Buffer.size() ? Buffer[index] : '\0';
    }

    // A character that ends a numeric token: blank, tab, the first character
    // of any operator, or a delimiter.
    bool IsBoundary(char c) const
    {
        return c == ' ' or c == '\t' or OperatorsMap.count(c) != 0 or DelimitersSet.count(c) != 0;
    }

    // First position at or after `pos` that holds a boundary character (or
    // Buffer.size() if there is none).  Used to swallow a whole malformed token.
    std::size_t SkipToBoundary(std::size_t pos) const
    {
        while (pos < Buffer.size() and not IsBoundary(Buffer[pos]))
        {
            ++pos;
        }
        return pos;
    }

    // First position at or after `pos` that is neither a decimal digit nor '.'.
    std::size_t SkipDigitsAndDots(std::size_t pos) const
    {
        while (pos < Buffer.size() and (IsDigit(Buffer[pos]) or Buffer[pos] == '.'))
        {
            ++pos;
        }
        return pos;
    }

    // Prints the first `length` characters of Buffer as a token of kind `tag`,
    // bumps `counter` and removes those characters from Buffer.
    void Emit(const char *tag, std::size_t length, int &counter)
    {
        const std::string text = Buffer.substr(0, length);
        std::printf("%d <%s,%s>\n", LineCount, tag, text.c_str());
        ++counter;
        Buffer.erase(0, length);
    }

    // ---- token rules -------------------------------------------------------------

    // Tries every token rule in priority order on the start of Buffer (which
    // must be non-empty).  Returns false when no rule accepts the input; the
    // caller treats that as "stop lexing".
    bool Parse()
    {
        return ParseCharacter() or ParseString()
               or ParseNumber()
               or ParseOperator() or ParseDelimiter()
               or ParseKeyword()
               or ParseIdentifier();
    }

    // Appends the next input line (without its '\n') to Buffer and advances
    // LineCount.  Returns false once the input is exhausted, i.e. when EOF is
    // hit before any character of a new line was read, or when no stream was
    // supplied.
    bool Readline()
    {
        if (File == nullptr)
        {
            return false;
        }

        bool sawCharacter = false;
        for (;;)
        {
            const int c = std::fgetc(File);

            if (c == EOF)
            {
                // A final line without a trailing newline still counts.
                if (sawCharacter)
                {
                    ++LineCount;
                }
                return sawCharacter;
            }

            if (c == '\n')
            {
                ++LineCount;
                return true;
            }

            Buffer.push_back(static_cast<char>(c));
            sawCharacter = true;
        }
    }

    // Accepts one of the words in KeywordsMap when it is not immediately
    // followed by another identifier character (so "doubles" is not "double").
    bool ParseKeyword()
    {
        const auto bucket = KeywordsMap.find(Buffer.front());
        if (bucket == KeywordsMap.end())
        {
            return false;
        }

        for (const std::string &keyword: bucket->second)
        {
            const std::size_t length = keyword.size();

            // compare() clamps to the buffer length, so a keyword longer than
            // the remaining input simply fails to match.
            if (Buffer.compare(0, length, keyword) != 0)
            {
                continue;
            }

            if (length == Buffer.size() or not IsIdentifierChar(Buffer[length]))
            {
                Emit("KEYWORD", length, KeywordCount);
                return true;
            }
        }

        return false;
    }

    // Accepts [A-Za-z_][A-Za-z0-9_]*.  The scan itself stops at the first
    // non-identifier character, so whatever follows starts the next token.
    bool ParseIdentifier()
    {
        if (not IsIdentifierStart(Buffer.front()))
        {
            return false;
        }

        std::size_t pos = 1;
        while (pos < Buffer.size() and IsIdentifierChar(Buffer[pos]))
        {
            ++pos;
        }

        Emit("IDENTIFIER", pos, IdentifierCount);
        return true;
    }

    // Accepts a single character from DelimitersSet.
    bool ParseDelimiter()
    {
        if (DelimitersSet.count(Buffer.front()) == 0)
        {
            return false;
        }

        Emit("DELIMITER", 1, DelimiterCount);
        return true;
    }

    // Accepts the first operator spelling from OperatorsMap that prefixes
    // Buffer.  Each list is ordered longest-first, which yields maximal munch
    // (">>=" wins over ">>" and ">").
    bool ParseOperator()
    {
        const auto bucket = OperatorsMap.find(Buffer.front());
        if (bucket == OperatorsMap.end())
        {
            return false;
        }

        for (const std::string &spelling: bucket->second)
        {
            if (Buffer.compare(0, spelling.size(), spelling) == 0)
            {
                Emit("OPERATOR", spelling.size(), OperatorCount);
                return true;
            }
        }

        return false;
    }

    // Shared body of ParseCharacter/ParseString.
    //
    // `prefixLength` characters of encoding prefix (possibly zero) must be
    // followed by `quote`.  The literal runs to the next unescaped `quote`; a
    // backslash escapes exactly one following character.  If the line ends
    // first, the whole remaining line is reported as one ERROR token.
    bool ParseQuoted(char quote, std::size_t prefixLength, const char *tag, int &counter)
    {
        if (At(prefixLength) != quote)
        {
            return false;
        }

        std::size_t pos = prefixLength + 1;
        while (pos < Buffer.size() and Buffer[pos] != quote)
        {
            // May step past the end when the line finishes with a lone
            // backslash; the check below treats that as "not closed".
            pos += (Buffer[pos] == '\\') ? 2 : 1;
        }

        if (pos >= Buffer.size())
        {
            Emit("ERROR", Buffer.size(), ErrorCount);
            return true;
        }

        Emit(tag, pos + 1, counter);
        return true;
    }

    // Accepts a character constant, optionally prefixed with L, u or U.
    bool ParseCharacter()
    {
        const char head = Buffer.front();
        const bool hasPrefix = (head == 'L' or head == 'u' or head == 'U') and At(1) == '\'';

        return ParseQuoted('\'', hasPrefix ? 1 : 0, "CHARCON", CharCount);
    }

    // Accepts a string literal, optionally prefixed with L, u, U or u8.
    bool ParseString()
    {
        const char head = Buffer.front();
        std::size_t prefixLength = 0;

        if (head == 'u' or head == 'U' or head == 'L')
        {
            if (At(1) == '"')
            {
                prefixLength = 1;
            }
            else if (head == 'u' and At(1) == '8' and At(2) == '"')
            {
                prefixLength = 2;
            }
        }

        return ParseQuoted('"', prefixLength, "STRING", StringCount);
    }

    // Accepts a numeric literal: hexadecimal ("0x..."), or a run of decimal
    // digits and dots with an optional exponent and optional u/U/l/L/f/F
    // suffix characters.  A literal that is not followed by a boundary
    // character, or whose exponent has no digits, is swallowed up to the next
    // boundary and reported as ERROR.  Returns false only when the input does
    // not start a number at all (including a '.' not followed by a digit,
    // which is the member-access operator).
    bool ParseNumber()
    {
        const char head = Buffer.front();
        if (not IsDigit(head) and head != '.')
        {
            return false;
        }

        if (head == '0' and (At(1) == 'x' or At(1) == 'X'))
        {
            ParseHexadecimalNumber();
            return true;
        }

        std::size_t pos = 0;
        if (head == '.')
        {
            // Distinguish a decimal point from the '.' operator.
            if (not IsDigit(At(1)))
            {
                return false;
            }
            pos = 1;
        }

        pos = SkipDigitsAndDots(pos);

        if (pos < Buffer.size() and (Buffer[pos] == 'e' or Buffer[pos] == 'E'))
        {
            ++pos;

            if (pos < Buffer.size() and (Buffer[pos] == '+' or Buffer[pos] == '-'))
            {
                ++pos;
            }

            // An exponent marker must be followed by at least one digit.
            if (pos >= Buffer.size() or not IsDigit(Buffer[pos]))
            {
                Emit("ERROR", SkipToBoundary(pos), ErrorCount);
                return true;
            }
        }

        pos = SkipDigitsAndDots(pos);

        while (pos < Buffer.size() and IsNumberSuffix(Buffer[pos]))
        {
            ++pos;
        }

        if (pos < Buffer.size() and not IsBoundary(Buffer[pos]))
        {
            // Trailing junk glued to the number, e.g. "12abc".
            Emit("ERROR", SkipToBoundary(pos), ErrorCount);
            return true;
        }

        Emit("NUMBER", pos, NumberCount);
        return true;
    }

    // Handles a literal that starts with "0x"/"0X" (checked by the caller).
    // Hex digits are optional; anything other than a boundary after them
    // turns the whole token into an ERROR.
    void ParseHexadecimalNumber()
    {
        std::size_t pos = 2;
        while (pos < Buffer.size() and IsHexDigit(Buffer[pos]))
        {
            ++pos;
        }

        if (pos < Buffer.size() and not IsBoundary(Buffer[pos]))
        {
            Emit("ERROR", SkipToBoundary(pos), ErrorCount);
            return;
        }

        Emit("NUMBER", pos, NumberCount);
    }

    // Keywords, bucketed by first character.
    const std::unordered_map<char, std::vector<std::string>> KeywordsMap = {
            {'a', {"auto"}},
            {'b', {"break"}},
            {'c', {"case",     "char",    "const",  "continue"}},
            {'d', {"double",   "default", "do"}},
            {'e', {"else",     "extern",  "enum"}},
            {'f', {"float",    "for"}},
            {'g', {"goto"}},
            {'i', {"if",       "int"}},
            {'l', {"long"}},
            {'s', {"struct",   "static",  "switch", "short", "signed", "sizeof"}},
            {'r', {"register", "return"}},
            {'t', {"typedef",}},
            {'u', {"union",    "unsigned"}},
            {'v', {"void",     "volatile"}},
            {'w', {"while"}}
    };

    // Operators, bucketed by first character and ordered longest-first so the
    // first match is the longest one.
    const std::unordered_map<char, std::vector<std::string>> OperatorsMap = {
            {'+', {"++",  "+=", "+"}},
            {'-', {"--",  "-=", "->", "-"}},
            {'*', {"*=",  "*"}},
            {'/', {"/=",  "/"}},
            {'%', {"%=",  "%"}},
            {'=', {"==",  "="}},
            {'!', {"!=",  "!"}},
            {'>', {">>=", ">>", ">=", ">"}},
            {'<', {"<<=", "<<", "<=", "<"}},
            {'&', {"&&",  "&=", "&"}},
            {'|', {"||",  "|=", "|"}},
            {'^', {"^=",  "^"}},
            {'.', {"."}},
            {'~', {"~"}}
    };

    // Single-character delimiters.
    const std::unordered_set<char> DelimitersSet = {
            ';', ',', ':', '?', '(', ')', '[', ']', '{', '}'
    };
};

// Entry point.
//
// Expects exactly one argument: the path of the source file to tokenise.
// Prints the token stream produced by LexicalParser::Loop, followed by three
// summary lines: the line count, the seven per-kind token counts, and the
// error count.  Returns 1 without lexing when the argument is missing or the
// file cannot be opened.
int main(int argc, char *argv[])
{
    // Check argc BEFORE touching argv[1]: with no argument argv[1] is a null
    // pointer and must not be handed to fopen.
    FILE *source_file = (argc == 2) ? fopen(argv[1], "r") : nullptr;

    if (source_file == nullptr)
    {
        printf("Failed to open source File.\n");
        // Bail out here; continuing would read from and close a null stream.
        return 1;
    }

    LexicalParser parser(source_file);

    parser.Loop();

    printf("%d\n", parser.LineCount);
    printf("%d %d %d %d %d %d %d\n", parser.KeywordCount,
           parser.IdentifierCount,
           parser.OperatorCount,
           parser.DelimiterCount,
           parser.CharCount,
           parser.StringCount,
           parser.NumberCount);
    printf("%d", parser.ErrorCount);

    fclose(source_file);
    return 0;
}