RubbishBin/LexicalParser/LexicalParser.cpp

752 lines
20 KiB
C++

#include <cstdio>
#include <list>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <string>
/// Line-oriented lexical scanner for a C-like language.
///
/// The scanner reads the input stream one line at a time, prints one
/// "<line> <KIND,text>" record per token to stdout and keeps a running total
/// for every token category in the public counters below.
class LexicalParser
{
public:
    int LineCount = 0;       ///< Lines read so far; also the line number printed with each token.
    int KeywordCount = 0;    ///< Number of KEYWORD records emitted.
    int IdentifierCount = 0; ///< Number of IDENTIFIER records emitted.
    int OperatorCount = 0;   ///< Number of OPERATOR records emitted.
    int DelimiterCount = 0;  ///< Number of DELIMITER records emitted.
    int CharCount = 0;       ///< Number of CHARCON (character literal) records emitted.
    int StringCount = 0;     ///< Number of STRING records emitted.
    int NumberCount = 0;     ///< Number of NUMBER records emitted.
    int ErrorCount = 0;      ///< Number of ERROR records emitted.

    /// @param file Input stream to scan. The parser does not close it.
    explicit LexicalParser(FILE *file) : File(file)
    {
    }

    /// Scans the whole input, emitting tokens and updating the counters.
    ///
    /// Scanning stops early (without reporting anything) as soon as the head
    /// of the current line matches none of the token rules.
    void Loop()
    {
        // True while we are inside a /* ... */ comment that spans lines.
        bool inBlockComment = false;
        while (Readline())
        {
            while (!Buffer.empty())
            {
                if (inBlockComment)
                {
                    if (!SkipPastBlockCommentEnd())
                    {
                        // No terminator on this line: the whole line is comment.
                        Buffer.clear();
                        continue;
                    }
                    inBlockComment = false;
                    if (Buffer.empty())
                    {
                        // The comment ended exactly at the end of the line.
                        continue;
                    }
                }

                auto pos = Buffer.begin();
                if (*pos == '/')
                {
                    ++pos;
                    if (pos != Buffer.end())
                    {
                        if (*pos == '/')
                        {
                            // Single-line comment: drop the rest of the line.
                            Buffer.clear();
                            continue;
                        }
                        if (*pos == '*')
                        {
                            // Block comment opener: drop "/*" and switch mode.
                            inBlockComment = true;
                            ++pos;
                            Buffer.erase(Buffer.begin(), pos);
                            continue;
                        }
                    }
                }

                const char head = Buffer.front();
                if (head == ' ' or head == '\t')
                {
                    Buffer.pop_front();
                    continue;
                }
                // '@' is reported as a dedicated one-character error.
                if (head == '@')
                {
                    Buffer.pop_front();
                    ErrorCount++;
                    Emit("ERROR", "@");
                    continue;
                }
                if (!Parse())
                {
                    return;
                }
            }
        }
    }

private:
    using Iterator = std::list<char>::iterator;

    std::list<char> Buffer; ///< Unconsumed characters of the current line.
    FILE *File;             ///< Input stream (not owned).

    // ---- small character-class helpers -------------------------------------

    static bool IsDigit(char c)
    {
        return c >= '0' and c <= '9';
    }

    static bool IsHexDigit(char c)
    {
        return IsDigit(c) or (c >= 'A' and c <= 'F') or (c >= 'a' and c <= 'f');
    }

    static bool IsIdentifierStart(char c)
    {
        return c == '_' or (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z');
    }

    static bool IsNumberSuffix(char c)
    {
        return c == 'u' or c == 'l' or c == 'U' or c == 'L' or c == 'f' or c == 'F';
    }

    /// True when `pos` is a place where a word/number token may legally end:
    /// end of line, blank, TAB, a delimiter or the first character of an operator.
    bool IsTokenBoundary(Iterator pos)
    {
        if (pos == Buffer.end())
        {
            return true;
        }
        const char c = *pos;
        return c == ' ' or c == '\t' or OperatorsMap.count(c) != 0 or DelimitersSet.count(c) != 0;
    }

    /// Advances `pos` to the next token boundary (used to swallow a malformed token).
    Iterator SkipToBoundary(Iterator pos)
    {
        while (!IsTokenBoundary(pos))
        {
            ++pos;
        }
        return pos;
    }

    /// Advances `pos` over a run of decimal digits and dots.
    Iterator SkipDigitsAndDots(Iterator pos)
    {
        while (pos != Buffer.end() and (IsDigit(*pos) or *pos == '.'))
        {
            ++pos;
        }
        return pos;
    }

    /// Removes [begin, pos) from the buffer and returns it as a string.
    std::string TakePrefix(Iterator pos)
    {
        std::string text(Buffer.begin(), pos);
        Buffer.erase(Buffer.begin(), pos);
        return text;
    }

    /// Prints one "<line> <KIND,text>" record.
    void Emit(const char *kind, const std::string &text)
    {
        printf("%d <%s,%s>\n", LineCount, kind, text.c_str());
    }

    /// Checks whether the buffer starts with `text`; on success `after`
    /// points just behind the matched characters.
    bool MatchPrefix(const std::string &text, Iterator &after)
    {
        if (text.length() > Buffer.size())
        {
            return false;
        }
        auto pos = Buffer.begin();
        for (const char c : text)
        {
            if (c != *pos)
            {
                return false;
            }
            ++pos;
        }
        after = pos;
        return true;
    }

    /// Looks for "*/" in the buffer. When found, everything up to and
    /// including the terminator is removed and true is returned.
    bool SkipPastBlockCommentEnd()
    {
        for (auto it = Buffer.begin(); it != Buffer.end(); ++it)
        {
            if (*it != '*')
            {
                continue;
            }
            // Peek without consuming, so that "**/" is still recognised and a
            // trailing '*' never moves the iterator past the end.
            auto next = it;
            ++next;
            if (next != Buffer.end() and *next == '/')
            {
                ++next;
                Buffer.erase(Buffer.begin(), next);
                return true;
            }
        }
        return false;
    }

    // ---- token rules -------------------------------------------------------

    /// Tries every token rule on the head of the (non-empty) buffer.
    /// @return false when no rule matched.
    bool Parse()
    {
        if (ParseCharacter() or ParseString())
        {
            return true;
        }
        if (ParseNumber())
        {
            return true;
        }
        if (ParseOperator() or ParseDelimiter())
        {
            return true;
        }
        if (ParseKeyword())
        {
            return true;
        }
        return ParseIdentifier();
    }

    /// Appends the next input line (without its '\n') to the buffer.
    /// @return false once the input is exhausted and nothing was read.
    bool Readline()
    {
        if (File == nullptr)
        {
            return false;
        }
        // Tracks whether a final line without trailing '\n' holds any data.
        bool readAny = false;
        while (true)
        {
            const int c = fgetc(File);
            if (c == EOF)
            {
                if (readAny)
                {
                    LineCount++;
                }
                return readAny;
            }
            if (c == '\n')
            {
                LineCount++;
                return true;
            }
            Buffer.emplace_back(static_cast<char>(c));
            readAny = true;
        }
    }

    /// Recognises a reserved word that is followed by a token boundary.
    bool ParseKeyword()
    {
        const auto entry = KeywordsMap.find(Buffer.front());
        if (entry == KeywordsMap.end())
        {
            return false;
        }
        for (const auto &word : entry->second)
        {
            Iterator after;
            // The word must not be a mere prefix of a longer identifier.
            if (MatchPrefix(word, after) and IsTokenBoundary(after))
            {
                KeywordCount++;
                Emit("KEYWORD", TakePrefix(after));
                return true;
            }
        }
        return false;
    }

    /// Recognises [_A-Za-z][_A-Za-z0-9]* followed by a token boundary.
    bool ParseIdentifier()
    {
        auto pos = Buffer.begin();
        if (!IsIdentifierStart(*pos))
        {
            return false;
        }
        while (pos != Buffer.end() and (IsIdentifierStart(*pos) or IsDigit(*pos)))
        {
            ++pos;
        }
        if (!IsTokenBoundary(pos))
        {
            return false;
        }
        IdentifierCount++;
        Emit("IDENTIFIER", TakePrefix(pos));
        return true;
    }

    /// Recognises a single delimiter character.
    bool ParseDelimiter()
    {
        const char c = Buffer.front();
        if (DelimitersSet.count(c) == 0)
        {
            return false;
        }
        DelimiterCount++;
        Emit("DELIMITER", std::string(1, c));
        Buffer.pop_front();
        return true;
    }

    /// Recognises an operator. Candidates are listed longest first, so the
    /// first match is the longest one; what follows is deliberately not checked.
    bool ParseOperator()
    {
        const auto entry = OperatorsMap.find(Buffer.front());
        if (entry == OperatorsMap.end())
        {
            return false;
        }
        for (const auto &op : entry->second)
        {
            Iterator after;
            if (MatchPrefix(op, after))
            {
                OperatorCount++;
                Emit("OPERATOR", TakePrefix(after));
                return true;
            }
        }
        return false;
    }

    /// Shared implementation for character and string literals.
    ///
    /// Accepts an optional L / u / U prefix (and u8 when `allowU8` is set),
    /// then scans to the closing `quote`, honouring backslash escapes. A
    /// literal that is not closed on the current line consumes the rest of
    /// the line and is reported as ERROR.
    ///
    /// @return true when something was consumed (literal or error).
    bool ParseQuoted(char quote, bool allowU8, const char *kind, int &counter)
    {
        std::string output;
        const auto first = Buffer.begin();
        const char lead = *first;
        if (lead == 'L' or lead == 'u' or lead == 'U')
        {
            auto second = first;
            ++second;
            if (second != Buffer.end())
            {
                if (*second == quote)
                {
                    output += lead;
                    Buffer.erase(first, second);
                }
                else if (allowU8 and lead == 'u' and *second == '8')
                {
                    auto third = second;
                    ++third;
                    if (third != Buffer.end() and *third == quote)
                    {
                        output = "u8";
                        Buffer.erase(first, third);
                    }
                }
            }
        }

        if (Buffer.front() != quote)
        {
            return false;
        }

        auto pos = Buffer.begin();
        ++pos;
        while (pos != Buffer.end() and *pos != quote)
        {
            if (*pos == '\\')
            {
                // Skip the escaped character; a trailing backslash means the
                // literal is unterminated.
                ++pos;
                if (pos == Buffer.end())
                {
                    break;
                }
            }
            ++pos;
        }

        if (pos == Buffer.end())
        {
            // Not closed on this line.
            output += TakePrefix(Buffer.end());
            ErrorCount++;
            Emit("ERROR", output);
            return true;
        }

        ++pos; // include the closing quote
        counter++;
        output += TakePrefix(pos);
        Emit(kind, output);
        return true;
    }

    /// Recognises a character literal such as 'a', '\n' or L'x'.
    bool ParseCharacter()
    {
        return ParseQuoted('\'', false, "CHARCON", CharCount);
    }

    /// Recognises a string literal such as "abc", L"abc" or u8"abc".
    bool ParseString()
    {
        return ParseQuoted('"', true, "STRING", StringCount);
    }

    /// Reports the malformed token that starts at the buffer head and runs
    /// from `pos` up to the next boundary.
    void EmitMalformed(Iterator pos)
    {
        const std::string text = TakePrefix(SkipToBoundary(pos));
        ErrorCount++;
        Emit("ERROR", text);
    }

    /// Recognises decimal / floating / hexadecimal numeric literals.
    ///
    /// A leading '.' only starts a number when a digit follows (otherwise it
    /// is the member-access operator). Malformed numbers are consumed up to
    /// the next boundary and reported as ERROR.
    ///
    /// @return true when something was consumed (number or error).
    bool ParseNumber()
    {
        const auto first = Buffer.begin();
        if (!IsDigit(*first) and *first != '.')
        {
            return false;
        }
        auto second = first;
        ++second;
        const bool hasSecond = second != Buffer.end();

        if (*first == '0' and hasSecond and (*second == 'x' or *second == 'X'))
        {
            ParseHexadecimalNumber();
            return true;
        }

        auto pos = first;
        if (*first == '.')
        {
            // Distinguish a decimal point from the '.' operator.
            if (!hasSecond or !IsDigit(*second))
            {
                return false;
            }
            ++pos;
        }

        pos = SkipDigitsAndDots(pos);

        if (pos != Buffer.end() and (*pos == 'e' or *pos == 'E'))
        {
            ++pos;
            if (pos != Buffer.end() and (*pos == '+' or *pos == '-'))
            {
                ++pos;
            }
            if (pos == Buffer.end() or !IsDigit(*pos))
            {
                // Exponent marker without digits.
                EmitMalformed(pos);
                return true;
            }
        }

        pos = SkipDigitsAndDots(pos);

        while (pos != Buffer.end() and IsNumberSuffix(*pos))
        {
            ++pos;
        }

        if (!IsTokenBoundary(pos))
        {
            // Garbage glued to the number, e.g. "12ab".
            EmitMalformed(pos);
            return true;
        }

        NumberCount++;
        Emit("NUMBER", TakePrefix(pos));
        return true;
    }

    /// Consumes a literal that starts with "0x"/"0X" (the caller has already
    /// verified the two-character prefix).
    void ParseHexadecimalNumber()
    {
        auto pos = Buffer.begin();
        ++pos;
        ++pos;
        while (pos != Buffer.end() and IsHexDigit(*pos))
        {
            ++pos;
        }
        if (!IsTokenBoundary(pos))
        {
            EmitMalformed(pos);
            return;
        }
        NumberCount++;
        Emit("NUMBER", TakePrefix(pos));
    }

    /// Reserved words, bucketed by their first character.
    const std::unordered_map<char, std::vector<std::string>> KeywordsMap = {
        {'a', {"auto"}},
        {'b', {"break"}},
        {'c', {"case", "char", "const", "continue"}},
        {'d', {"double", "default", "do"}},
        {'e', {"else", "extern", "enum"}},
        {'f', {"float", "for"}},
        {'g', {"goto"}},
        {'i', {"if", "int"}},
        {'l', {"long"}},
        {'s', {"struct", "static", "switch", "short", "signed", "sizeof"}},
        {'r', {"register", "return"}},
        {'t', {"typedef",}},
        {'u', {"union", "unsigned"}},
        {'v', {"void", "volatile"}},
        {'w', {"while"}}
    };

    /// Operators, bucketed by their first character and ordered longest
    /// first so that the first match is the longest match.
    const std::unordered_map<char, std::vector<std::string>> OperatorsMap = {
        {'+', {"++", "+=", "+"}},
        {'-', {"--", "-=", "->", "-"}},
        {'*', {"*=", "*"}},
        {'/', {"/=", "/"}},
        {'%', {"%=", "%"}},
        {'=', {"==", "="}},
        {'!', {"!=", "!"}},
        {'>', {">>=", ">>", ">=", ">"}},
        {'<', {"<<=", "<<", "<=", "<"}},
        {'&', {"&&", "&=", "&"}},
        {'|', {"||", "|=", "|"}},
        {'^', {"^=", "^"}},
        {'.', {"."}},
        {'~', {"~"}}
    };

    /// Single-character delimiters.
    const std::unordered_set<char> DelimitersSet = {
        ';', ',', ':', '?', '(', ')', '[', ']', '{', '}'
    };
};
/// Entry point: scans the file named by the single command-line argument and
/// prints the line count, the per-category token counts and the error count.
///
/// @return 0 on success, 1 when the argument is missing or the file cannot be opened.
int main(int argc, char *argv[])
{
    // Validate argc before touching argv[1].
    if (argc != 2)
    {
        printf("Failed to open source File.\n");
        return 1;
    }
    FILE *source_file = fopen(argv[1], "r");
    if (source_file == nullptr)
    {
        // Bail out instead of handing a null stream to the parser / fclose.
        printf("Failed to open source File.\n");
        return 1;
    }
    LexicalParser parser(source_file);
    parser.Loop();
    printf("%d\n", parser.LineCount);
    printf("%d %d %d %d %d %d %d\n", parser.KeywordCount,
           parser.IdentifierCount,
           parser.OperatorCount,
           parser.DelimiterCount,
           parser.CharCount,
           parser.StringCount,
           parser.NumberCount);
    printf("%d", parser.ErrorCount);
    fclose(source_file);
    return 0;
}