//===- Lexer.h - Lexer for the Toy language -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple Lexer for the Toy language.
//
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef TOY_LEXER_H
|
|
#define TOY_LEXER_H
|
|
|
|
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>

#include "SyntaxNode.h"
|
|
|
|
namespace hello
|
|
{
|
|
/// Tokens returned by the lexer.
///
/// Punctuation tokens use the ASCII code of the character they stand for as
/// their value; end-of-file, keywords and the "primary" tokens use negative
/// values so that they do not overlap with those character codes.
enum Token : int
{
  // Single-character punctuation (value == the character itself).
  tok_semicolon = ';',
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',

  // End of input.
  tok_eof = -1,

  // commands
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,

  // primary
  tok_identifier = -5,
  tok_number = -6,
};
|
|
|
|
/// The Lexer is an abstract base class providing all the facilities that the
|
|
/// Parser expects. It goes through the stream one token at a time and keeps
|
|
/// track of the location in the file for debugging purposes.
|
|
/// It relies on a subclass to provide a `readNextLine()` method. The subclass
|
|
/// can proceed by reading the next line from the standard input or from a
|
|
/// memory mapped file.
|
|
class Lexer
|
|
{
|
|
public:
|
|
/// Create a lexer for the given filename. The filename is kept only for
|
|
/// debugging purposes (attaching a location to a Token).
|
|
explicit Lexer(std::string filename)
|
|
: lastLocation(
|
|
{std::make_shared<std::string>(std::move(filename)), 0, 0})
|
|
{
|
|
}
|
|
|
|
virtual ~Lexer() = default;
|
|
|
|
/// Look at the current token in the stream.
|
|
Token getCurToken() const { return curTok; }
|
|
|
|
/// Move to the next token in the stream and return it.
|
|
Token getNextToken() { return curTok = getTok(); }
|
|
|
|
/// Move to the next token in the stream, asserting on the current token
|
|
/// matching the expectation.
|
|
void consume(Token tok)
|
|
{
|
|
assert(tok == curTok && "consume Token mismatch expectation");
|
|
getNextToken();
|
|
}
|
|
|
|
/// Return the current identifier (prereq: getCurToken() == tok_identifier)
|
|
llvm::StringRef getId()
|
|
{
|
|
assert(curTok == tok_identifier);
|
|
return identifierStr;
|
|
}
|
|
|
|
/// Return the current number (prereq: getCurToken() == tok_number)
|
|
double getValue()
|
|
{
|
|
assert(curTok == tok_number);
|
|
return numVal;
|
|
}
|
|
|
|
/// Return the location for the beginning of the current token.
|
|
Location getLastLocation() { return lastLocation; }
|
|
|
|
// Return the current line in the file.
|
|
int getLine() const { return curLineNum; }
|
|
|
|
// Return the current column in the file.
|
|
int getCol() const { return curCol; }
|
|
|
|
private:
|
|
/// Delegate to a derived class fetching the next line. Returns an empty
|
|
/// string to signal end of file (EOF). Lines are expected to always finish
|
|
/// with "\n"
|
|
virtual llvm::StringRef readNextLine() = 0;
|
|
|
|
/// Return the next character from the stream. This manages the buffer for the
|
|
/// current line and request the next line buffer to the derived class as
|
|
/// needed.
|
|
int getNextChar()
|
|
{
|
|
// The current line buffer should not be empty unless it is the end of file.
|
|
if (curLineBuffer.empty())
|
|
return EOF;
|
|
++curCol;
|
|
auto nextchar = curLineBuffer.front();
|
|
curLineBuffer = curLineBuffer.drop_front();
|
|
if (curLineBuffer.empty())
|
|
curLineBuffer = readNextLine();
|
|
if (nextchar == '\n')
|
|
{
|
|
++curLineNum;
|
|
curCol = 0;
|
|
}
|
|
return nextchar;
|
|
}
|
|
|
|
/// Return the next token from standard input.
|
|
Token getTok()
|
|
{
|
|
// Skip any whitespace.
|
|
while (isspace(lastChar))
|
|
lastChar = Token(getNextChar());
|
|
|
|
// Save the current location before reading the token characters.
|
|
lastLocation.line = curLineNum;
|
|
lastLocation.col = curCol;
|
|
|
|
// Identifier: [a-zA-Z][a-zA-Z0-9_]*
|
|
if (isalpha(lastChar))
|
|
{
|
|
identifierStr = (char)lastChar;
|
|
while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_')
|
|
identifierStr += (char)lastChar;
|
|
|
|
if (identifierStr == "return")
|
|
return tok_return;
|
|
if (identifierStr == "def")
|
|
return tok_def;
|
|
if (identifierStr == "var")
|
|
return tok_var;
|
|
return tok_identifier;
|
|
}
|
|
|
|
// Number: [0-9.]+
|
|
if (isdigit(lastChar) || lastChar == '.')
|
|
{
|
|
std::string numStr;
|
|
do
|
|
{
|
|
numStr += lastChar;
|
|
lastChar = Token(getNextChar());
|
|
}
|
|
while (isdigit(lastChar) || lastChar == '.');
|
|
|
|
numVal = strtod(numStr.c_str(), nullptr);
|
|
return tok_number;
|
|
}
|
|
|
|
if (lastChar == '#')
|
|
{
|
|
// Comment until end of line.
|
|
do
|
|
{
|
|
lastChar = Token(getNextChar());
|
|
}
|
|
while (lastChar != EOF && lastChar != '\n' && lastChar != '\r');
|
|
|
|
if (lastChar != EOF)
|
|
return getTok();
|
|
}
|
|
|
|
// Check for end of file. Don't eat the EOF.
|
|
if (lastChar == EOF)
|
|
return tok_eof;
|
|
|
|
// Otherwise, just return the character as its ascii value.
|
|
Token thisChar = Token(lastChar);
|
|
lastChar = Token(getNextChar());
|
|
return thisChar;
|
|
}
|
|
|
|
/// The last token read from the input.
|
|
Token curTok = tok_eof;
|
|
|
|
/// Location for `curTok`.
|
|
Location lastLocation;
|
|
|
|
/// If the current Token is an identifier, this string contains the value.
|
|
std::string identifierStr;
|
|
|
|
/// If the current Token is a number, this contains the value.
|
|
double numVal = 0;
|
|
|
|
/// The last value returned by getNextChar(). We need to keep it around as we
|
|
/// always need to read ahead one character to decide when to end a token and
|
|
/// we can't put it back in the stream after reading from it.
|
|
Token lastChar = Token(' ');
|
|
|
|
/// Keep track of the current line number in the input stream
|
|
int curLineNum = 0;
|
|
|
|
/// Keep track of the current column number in the input stream
|
|
int curCol = 0;
|
|
|
|
/// Buffer supplied by the derived class on calls to `readNextLine()`
|
|
llvm::StringRef curLineBuffer = "\n";
|
|
};
|
|
|
|
/// A lexer implementation operating on a buffer in memory.
|
|
class LexerBuffer final : public Lexer
|
|
{
|
|
public:
|
|
LexerBuffer(const char* begin, const char* end, std::string filename)
|
|
: Lexer(std::move(filename)), current(begin), end(end)
|
|
{
|
|
}
|
|
|
|
private:
|
|
/// Provide one line at a time to the Lexer, return an empty string when
|
|
/// reaching the end of the buffer.
|
|
llvm::StringRef readNextLine() override
|
|
{
|
|
auto* begin = current;
|
|
while (current <= end && *current && *current != '\n')
|
|
++current;
|
|
if (current <= end && *current)
|
|
++current;
|
|
|
|
return llvm::StringRef{begin, static_cast<size_t>(current - begin)};
|
|
}
|
|
|
|
const char *current, *end;
|
|
};
|
|
} // namespace hello
|
|
|
|
#endif // TOY_LEXER_H
|