// hello-mlir/include/Lexer.h

//===- Lexer.h - Lexer for the Toy language -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple Lexer for the Toy language.
//
//===----------------------------------------------------------------------===//

#ifndef TOY_LEXER_H
#define TOY_LEXER_H

#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>

#include "SyntaxNode.h"

namespace hello
{
    /// Kinds of token produced by the lexer.
    ///
    /// Punctuation tokens use the character itself as their enumerator value;
    /// every other kind of token is assigned a distinct negative value.
    enum Token : int
    {
        // End of the input stream.
        tok_eof = -1,

        // Commands (keywords).
        tok_return = -2,
        tok_var = -3,
        tok_def = -4,

        // Primary expressions.
        tok_identifier = -5,
        tok_number = -6,

        // Single-character punctuation.
        tok_semicolon = ';',
        tok_parenthese_open = '(',
        tok_parenthese_close = ')',
        tok_bracket_open = '{',
        tok_bracket_close = '}',
        tok_sbracket_open = '[',
        tok_sbracket_close = ']',
    };

    /// The Lexer is an abstract base class providing all the facilities that the
    /// Parser expects. It goes through the stream one token at a time and keeps
    /// track of the location in the file for debugging purposes.
    /// It relies on a subclass to provide a `readNextLine()` method. The subclass
    /// can proceed by reading the next line from the standard input or from a
    /// memory mapped file.
    class Lexer
    {
    public:
        /// Create a lexer for the given filename. The filename is kept only for
        /// debugging purposes (attaching a location to a Token).
        explicit Lexer(std::string filename)
            : lastLocation(
                {std::make_shared<std::string>(std::move(filename)), 0, 0})
        {
        }

        virtual ~Lexer() = default;

        /// Look at the current token in the stream.
        Token getCurToken() const { return curTok; }

        /// Move to the next token in the stream and return it.
        Token getNextToken() { return curTok = getTok(); }

        /// Move to the next token in the stream, asserting on the current token
        /// matching the expectation.
        void consume(Token tok)
        {
            assert(tok == curTok && "consume Token mismatch expectation");
            getNextToken();
        }

        /// Return the current identifier (prereq: getCurToken() == tok_identifier).
        /// The returned reference points into the lexer's own storage, which is
        /// overwritten the next time an identifier or keyword is lexed.
        llvm::StringRef getId() const
        {
            assert(curTok == tok_identifier);
            return identifierStr;
        }

        /// Return the current number (prereq: getCurToken() == tok_number)
        double getValue() const
        {
            assert(curTok == tok_number);
            return numVal;
        }

        /// Return the location for the beginning of the current token.
        Location getLastLocation() const { return lastLocation; }

        /// Return the current line in the file.
        int getLine() const { return curLineNum; }

        /// Return the current column in the file.
        int getCol() const { return curCol; }

    private:
        /// Delegate to a derived class fetching the next line. Returns an empty
        /// string to signal end of file (EOF). Lines are expected to always finish
        /// with "\n"
        virtual llvm::StringRef readNextLine() = 0;

        /// Return the next character from the stream, or EOF once the stream is
        /// exhausted. This manages the buffer for the current line and requests
        /// the next line buffer from the derived class as needed.
        ///
        /// The character is returned as an `unsigned char` value widened to
        /// `int`. Widening a plain (possibly signed) `char` would turn bytes
        /// >= 0x80 into negative numbers, which collide with EOF and with the
        /// negative `Token` enumerators, and which are not valid arguments for
        /// the <cctype> classification functions.
        int getNextChar()
        {
            // The current line buffer should not be empty unless it is the end of file.
            if (curLineBuffer.empty())
                return EOF;
            ++curCol;
            const int nextchar = static_cast<unsigned char>(curLineBuffer.front());
            curLineBuffer = curLineBuffer.drop_front();
            // Refill eagerly so that an empty buffer always means end of file.
            if (curLineBuffer.empty())
                curLineBuffer = readNextLine();
            if (nextchar == '\n')
            {
                ++curLineNum;
                curCol = 0;
            }
            return nextchar;
        }

        /// Return the next token from the input stream.
        Token getTok()
        {
            // Skip any whitespace.
            while (isspace(lastChar))
                lastChar = Token(getNextChar());

            // Save the current location before reading the token characters.
            lastLocation.line = curLineNum;
            lastLocation.col = curCol;

            // Identifier: [a-zA-Z][a-zA-Z0-9_]*
            if (isalpha(lastChar))
            {
                identifierStr = static_cast<char>(lastChar);
                while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_')
                    identifierStr += static_cast<char>(lastChar);

                // Keywords are lexed as identifiers and recognised afterwards.
                if (identifierStr == "return")
                    return tok_return;
                if (identifierStr == "def")
                    return tok_def;
                if (identifierStr == "var")
                    return tok_var;
                return tok_identifier;
            }

            // Number: [0-9.]+
            // Malformed literals such as "1.2.3" are not diagnosed here: strtod
            // converts the longest valid prefix and ignores the remainder.
            if (isdigit(lastChar) || lastChar == '.')
            {
                std::string numStr;
                do
                {
                    numStr += static_cast<char>(lastChar);
                    lastChar = Token(getNextChar());
                }
                while (isdigit(lastChar) || lastChar == '.');

                numVal = strtod(numStr.c_str(), nullptr);
                return tok_number;
            }

            if (lastChar == '#')
            {
                // Comment until end of line.
                do
                {
                    lastChar = Token(getNextChar());
                }
                while (lastChar != EOF && lastChar != '\n' && lastChar != '\r');

                // Lex the token following the comment; when the comment ran up
                // to the end of the input, fall through to the EOF check below.
                if (lastChar != EOF)
                    return getTok();
            }

            // Check for end of file.  Don't eat the EOF.
            if (lastChar == EOF)
                return tok_eof;

            // Otherwise, just return the character as its ascii value.
            Token thisChar = Token(lastChar);
            lastChar = Token(getNextChar());
            return thisChar;
        }

        /// The last token read from the input.
        Token curTok = tok_eof;

        /// Location for `curTok`.
        Location lastLocation;

        /// If the current Token is an identifier, this string contains the value.
        std::string identifierStr;

        /// If the current Token is a number, this contains the value.
        double numVal = 0;

        /// The last value returned by getNextChar(). We need to keep it around as we
        /// always need to read ahead one character to decide when to end a token and
        /// we can't put it back in the stream after reading from it.
        Token lastChar = Token(' ');

        /// Keep track of the current line number in the input stream
        int curLineNum = 0;

        /// Keep track of the current column number in the input stream
        int curCol = 0;

        /// Buffer supplied by the derived class on calls to `readNextLine()`.
        /// It starts out as a single newline so that the first call to
        /// getNextChar() has something to consume and then pulls the first real
        /// line from the derived class.
        llvm::StringRef curLineBuffer = "\n";
    };

    /// A lexer implementation operating on a buffer in memory.
    ///
    /// The buffer is the half-open range [begin, end); it is not copied, so it
    /// must remain valid for as long as the lexer is in use.
    class LexerBuffer final : public Lexer
    {
    public:
        /// Create a lexer over the characters in [begin, end). `filename` is
        /// forwarded to the base Lexer constructor.
        LexerBuffer(const char* begin, const char* end, std::string filename)
            : Lexer(std::move(filename)), current(begin), end(end)
        {
        }

    private:
        /// Provide one line at a time to the Lexer, return an empty string when
        /// reaching the end of the buffer. A NUL byte inside the buffer is also
        /// treated as the end of the input.
        llvm::StringRef readNextLine() override
        {
            const char* lineBegin = current;
            // `end` is one past the last character and must never be
            // dereferenced, hence the strict `<` comparisons.
            while (current < end && *current && *current != '\n')
                ++current;
            // Include the terminating newline in the returned line.
            if (current < end && *current)
                ++current;

            return llvm::StringRef{lineBegin,
                                   static_cast<size_t>(current - lineBegin)};
        }

        /// Next character to hand out.
        const char* current;

        /// One past the last character of the buffer.
        const char* end;
    };
} // namespace hello

#endif // TOY_LEXER_H