Lexical.cpp Copyright (c) Friday, April 4, 2025, by James Pate Williams, Jr. Reference: “Modern Compiler Implementation in Java Second Edition” (c) 2002 by Andrew W. Appel with Jens Palsberg Chapter Two, Lexical Analysis

// Lexical.cpp
// Copyright (c) Friday, April 4, 2025, by
// James Pate Williams, Jr.
// Reference: "Modern Compiler Implementation
// in Java Second Edition" (c) 2002
// Andrew W. Appel with Jens Palsberg
// Chapter Two, Lexical Analysis

#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Character classes and keyword tables shared by the scanner functions.

// Upper- and lower-case ASCII letters.
std::vector<char> AlphabeticChars = { 'A', 'B', 'C', 'D', 'E', 'F',
    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e',
    'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
    's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };
// Single-character bracket tokens.
std::vector<char> BracketChars = { '(', ')', '{', '}', '[', ']' };
// Single-character arithmetic operator tokens.
std::vector<char> ArithmeticOps = { '+', '-', '*', '/', '%' };
// Single-character logical (bitwise) operator tokens.
std::vector<char> LogicalOps = { '&', '^', '|' };
// All ten decimal digits ('3' was previously missing from this list).
std::vector<char> DecimalDigits = { '0', '1', '2', '3', '4', '5', '6',
    '7', '8', '9' };
// Characters treated as blanks between tokens.
std::vector<char> WhiteSpace = { ' ', '\t', '\n' };
// Keywords that must not be reported as ordinary identifiers.
std::vector<std::string> ReservedWords = {
    "do", "double", "else", "for", "if", "int", "long", "while" };
// NUL-terminated copies of ReservedWords, one per row; filled in by main()
// before any scanning takes place. Each row holds at most 7 characters.
char reservedWordsAscii[8][8] = { { } };
// Letters followed by digits; built by main() from the two tables above.
std::vector<char> AlphaNumericChars;

bool TokenScan(
    std::string wline,
    bool& arithmeticOp,
    bool& bracket,
    bool& endOfLine,
    bool& id,
    bool& intNumber,
    bool& logicalOp,
    bool& realNumber,
    bool& reservedWord,
    bool& whiteSpace,
    char& arithmeticOpChar,
    char& bracketChar,
    char& ch,
    char& endOfLineChar,
    char& whiteSpaceChar,
    size_t& cptr,
    std::string& idStr,
    std::string& intNumberStr,
    std::string& realNumberStr,
    std::string& reservedWordStr) {
    const char* line = wline.c_str();

    ch = line[cptr];
    while (ch == ' ' || ch == '\t' || ch == '\n' && cptr < strlen(line)) {
        cptr++;
        ch = line[cptr];
    }
    if (ch == '\n' || cptr == strlen(line)) {
        endOfLine = true;
        endOfLineChar = '\n';
        return true;
    }
    ch = line[cptr];
    if (ch >= 'A' && ch <= 'Z' ||
        ch >= 'a' && ch <= 'z') {
        idStr += ch;
        cptr++;
        while (cptr < strlen(line)) {
            ch = line[cptr];
            if (ch >= '0' && ch <= '9' ||
                ch >= 'A' && ch <= 'Z' ||
                ch >= 'a' && ch <= 'z' ||
                ch == '_') {
                idStr += ch;
                cptr++;
            }
            else if (ch == ' ' || ch == '\t' || ch == '\n') {
                cptr++;
                break;
            }
            else {
                id = false;
                return false;
            }
        }
        id = true;
        char reservedWordAscii[8] = { };
        reservedWordStr = "";
        for (size_t i = 0; i < strlen(idStr.c_str()); i++) {
            reservedWordAscii[i] = idStr.c_str()[i];
        }
        reservedWordAscii[strlen(idStr.c_str())] = '\0';
        for (size_t i = 0; i < 8; i++) {
            size_t count = 0;
            reservedWord = false;
            reservedWordStr = "";
            for (size_t j = 0; j < strlen(reservedWordsAscii[i]); j++) {
                if (reservedWordAscii[j] == reservedWordsAscii[i][j]) {
                    reservedWord = true;
                    reservedWordStr += idStr[j];
                    count++;
                }
            }
            if (reservedWord && count == strlen(reservedWordsAscii[i])) {
                break;
            }
        }
        if (reservedWord) {
            id = false;
        }
        if (id) {
            id = true;
            reservedWord = false;
            reservedWordStr = "";
        }
        else {
            id = false;
            idStr = "";
        }
        return id || reservedWord;
    }
    else if (ch >= '0' && ch <= '9')
    {
        intNumberStr = "";
        realNumberStr = "";
        intNumberStr += ch;
        realNumberStr += ch;
        cptr++;
        ch = line[cptr];
        if (ch >= '0' && ch <= '9') {
            intNumberStr += ch;
            cptr++;
            ch = line[cptr];
            while (cptr < strlen(line)) {
                ch = line[cptr];
                if (ch >= '0' && ch <= '9') {
                    intNumberStr += ch;
                    cptr++;
                }
                else if (ch == ' ' || ch == '\t' || ch == '\n') {
                    cptr++;
                    break;
                }
                else {
                    intNumber = false;
                    return false;
                }
            }
            intNumber = true;
            return true;
        }
        else if (ch == '.') {
            realNumberStr += ch;
            cptr++;
            ch = line[cptr];
            if (ch >= '0' && ch <= '9') {
                realNumberStr += ch;
                cptr++;
                while (cptr < (int)strlen(line)) {
                    ch = line[cptr];
                    if (ch >= '0' && ch <= '9') {
                        realNumberStr += ch;
                        cptr++;
                    }
                    else if (ch == ' ' || ch == '\t' || ch == '\n') {
                        cptr++;
                        break;
                    }
                    else {
                        realNumber = false;
                        return false;
                    }
                }
                realNumber = true;
                return true;
            }
        }
    }
    else if (ch == '.') {
        realNumberStr += ch;
        cptr++;
        ch = line[cptr];
        if (ch >= '0' && ch <= '9') {
            realNumberStr += ch;
            cptr++;
            while (cptr < (int)strlen(line)) {
                ch = line[cptr];
                if (ch >= '0' && ch <= '9') {
                    realNumberStr += ch;
                    cptr++;
                }
                else if (ch == ' ' || ch == '\t' || ch == '\n') {
                    cptr++;
                    break;
                }
                else {
                    realNumber = false;
                    return false;
                }
            }
            realNumber = true;
            return true;
        }
        else {
            realNumber = false;
            return false;
        }
    }

    return false;
}

// Scans the next lexical element of `wline`, starting at offset `cptr`.
//
// All result flags and result characters are cleared first. Leading blanks
// are skipped; if nothing else remains, `endOfLine` is set and true is
// returned. Otherwise TokenScan is asked for an identifier, reserved word
// or number. When TokenScan fails, the character at `cptr` is examined:
//   * "\\" followed by "\\" starts a one-line comment; the rest of the
//     line (including its '\n', if any) is consumed and `endOfLine` is set.
//   * a member of ArithmeticOps sets `arithmeticOp` / `arithmeticOpChar`.
//   * a member of BracketChars sets `bracket` / `bracketChar`.
//   * a member of LogicalOps sets `logicalOp`.
//
// Operators and brackets are reported but NOT consumed: `cptr` is left on
// the character so that the caller can read it and step past it, which is
// what main() does for arithmetic operators.
//
// Returns true when something was recognised, false otherwise. After a
// failure `ch` holds the character at `cptr` ('\0' at the end of the text).
bool LineScan(
    std::string wline,
    bool& arithmeticOp,
    bool& bracket,
    bool& endOfLine,
    bool& id,
    bool& intNumber,
    bool& logicalOp,
    bool& realNumber,
    bool& reservedWord,
    bool& whiteSpace,
    char& arithmeticOpChar,
    char& bracketChar,
    char& ch,
    char& endOfLineChar,
    char& whiteSpaceChar,
    size_t& cptr,
    std::string& idStr,
    std::string& intNumberStr,
    std::string& realNumberStr,
    std::string& reservedWordStr)
{
    const char* line = wline.c_str();
    // Measured once; the text is treated as ending at its first '\0'.
    const size_t length = std::strlen(line);

    arithmeticOp = bracket = endOfLine = id = false;
    intNumber = logicalOp = realNumber = reservedWord = whiteSpace = false;
    arithmeticOpChar = bracketChar = endOfLineChar = whiteSpaceChar = '\0';

    while (cptr < length &&
        (line[cptr] == ' ' || line[cptr] == '\t' || line[cptr] == '\n')) {
        cptr++;
    }
    if (cptr >= length) {
        ch = '\0';
        endOfLine = true;
        endOfLineChar = '\n';
        return true;
    }

    if (TokenScan(
        wline,
        arithmeticOp,
        bracket,
        endOfLine,
        id,
        intNumber,
        logicalOp,
        realNumber,
        reservedWord,
        whiteSpace,
        arithmeticOpChar,
        bracketChar,
        ch,
        endOfLineChar,
        whiteSpaceChar,
        cptr,
        idStr,
        intNumberStr,
        realNumberStr,
        reservedWordStr)) {
        return true;
    }

    // TokenScan gave up; look at the character it stopped on.
    ch = (cptr < length) ? line[cptr] : '\0';

    if (ch == '\\') {
        cptr++;
        if (cptr == length) {
            return false;
        }
        ch = line[cptr];
        if (ch != '\\') {
            // single \ found
            return false;
        }
        // found a one-line comment
        // skip until end-of-line
        cptr++;
        while (cptr < length && line[cptr] != '\n') {
            cptr++;
        }
        if (cptr < length) {
            ch = '\n';
            cptr++;
        }
        else {
            // The comment runs to the end of the text without a '\n'.
            ch = '\0';
        }
        endOfLine = true;
        endOfLineChar = '\n';
        return true;
    }

    const auto contains = [](const std::vector<char>& set, char c) {
        return std::find(set.begin(), set.end(), c) != set.end();
    };
    if (contains(ArithmeticOps, ch)) {
        arithmeticOp = true;
        arithmeticOpChar = ch;
        return true;
    }
    if (contains(BracketChars, ch)) {
        bracket = true;
        bracketChar = ch;
        return true;
    }
    if (contains(LogicalOps, ch)) {
        logicalOp = true;
        return true;
    }

    return false;
}

// Test driver: builds the lookup tables, then scans four sample lines and
// prints every token LineScan reports, together with the scan position.
int main()
{
    // AlphaNumericChars = letters followed by digits.
    AlphaNumericChars.insert(AlphaNumericChars.end(),
        AlphabeticChars.begin(), AlphabeticChars.end());
    AlphaNumericChars.insert(AlphaNumericChars.end(),
        DecimalDigits.begin(), DecimalDigits.end());

    // Copy the reserved words into the fixed-size table used by the scanner.
    // The bounds come from the table itself, so a longer word list or an
    // over-long word cannot write outside it.
    const size_t tableRows =
        sizeof(reservedWordsAscii) / sizeof(reservedWordsAscii[0]);
    const size_t rowWidth = sizeof(reservedWordsAscii[0]);
    for (size_t i = 0; i < tableRows && i < ReservedWords.size(); i++) {
        const std::string& word = ReservedWords[i];
        if (word.size() >= rowWidth) {
            // No room for the terminating '\0'; leave the row empty.
            std::cerr << "Reserved word too long for table: " << word << '\n';
            continue;
        }
        std::copy(word.begin(), word.end(), reservedWordsAscii[i]);
        reservedWordsAscii[i][word.size()] = '\0';
    }

    const std::string lines[] = {
        "abc4 def_5 amp c1\n",
        ".1234 0.4567 9876 while for\n",
        "\\\\this is a one-line comment\n",
        "a + b / c\n" };

    for (const std::string& line : lines) {
        bool arithmeticOp = false, bracket = false, endOfLine = false;
        bool id = false, intNumber = false, logicalOp = false, realNumber = false;
        bool reservedWord = false, whiteSpace = false;
        char arithmeticOpChar = '\0', bracketChar = '\0', ch = '\0';
        char endOfLineChar = '\0', logicalOpChar = '\0', whiteSpaceChar = '\0';
        size_t cptr = 0;
        std::string idStr, intNumberStr, realNumberStr, reservedWordStr;
        const size_t length = line.size();

        while (cptr < length && ch != '\n') {
            const size_t tokenStart = cptr;
            const bool scanned = LineScan(
                line,
                arithmeticOp,
                bracket,
                endOfLine,
                id,
                intNumber,
                logicalOp,
                realNumber,
                reservedWord,
                whiteSpace,
                arithmeticOpChar,
                bracketChar,
                ch,
                endOfLineChar,
                whiteSpaceChar,
                cptr,
                idStr,
                intNumberStr,
                realNumberStr,
                reservedWordStr);

            if (scanned) {
                if (id) {
                    ch = line[cptr];
                    std::cout << "id = " << idStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    id = false;
                    idStr = "";
                }
                else if (intNumber) {
                    std::cout << "int number = " << intNumberStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    intNumber = false;
                    intNumberStr = "";
                }
                else if (realNumber) {
                    std::cout << "real number = " << realNumberStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    realNumber = false;
                    realNumberStr = "";
                }
                else if (reservedWord) {
                    std::cout << "reserved word = " << reservedWordStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    reservedWord = false;
                    reservedWordStr = "";
                }
                // LineScan leaves cptr on a single-character token; read
                // the character here and step past it.
                else if (arithmeticOp) {
                    arithmeticOpChar = ch = line[cptr++];
                    std::cout << "Arithmetic operator character = ";
                    std::cout << arithmeticOpChar << '\n';
                }
                else if (bracket) {
                    bracketChar = ch = line[cptr++];
                    std::cout << "Bracket character = ";
                    std::cout << bracketChar << '\n';
                }
                else if (logicalOp) {
                    logicalOpChar = ch = line[cptr++];
                    std::cout << "Logical operator character = ";
                    std::cout << logicalOpChar << '\n';
                }
                if (ch == '\0' || ch == '\n' || cptr >= length) {
                    break;
                }
            }

            // Guarantee progress: if nothing above moved the scan position,
            // report the character and skip it instead of rescanning the
            // same offset forever.
            if (cptr == tokenStart) {
                std::cout << "Unhandled character = " << line[cptr] << '\n';
                cptr++;
            }
        }
    }

    return 0;
}
Unknown's avatar

Author: jamespatewilliamsjr

My whole legal name is James Pate Williams, Jr. I was born in LaGrange, Georgia, approximately 70 years ago. I barely graduated from LaGrange High School with low marks in June 1971. Later in June 1979, I graduated from LaGrange College with a Bachelor of Arts in Chemistry with a little over a 3 out of 4 Grade Point Average (GPA). In the Spring Quarter of 1978, I taught myself how to program a Texas Instruments desktop programmable calculator and in the Summer Quarter of 1978 I taught myself Dayton BASIC (Beginner's All-purpose Symbolic Instruction Code) on LaGrange College's Data General Eclipse minicomputer. I took courses in BASIC in the Fall Quarter of 1978 and FORTRAN IV (Formula Translator IV) in the Winter Quarter of 1979. Professor Kenneth Cooper, a genius poly-scientist, taught me a course in the Intel 8085 microprocessor architecture and assembly and machine language. We would hand assemble our programs and insert the resulting machine code into our crude wooden box computer which was designed and built by Professor Cooper. From 1990 to 1994 I earned a Bachelor of Science in Computer Science from LaGrange College. I had a 4 out of 4 GPA in the period 1990 to 1994. I took courses in C, COBOL, and Pascal during my BS work. After graduating from LaGrange College a second time in May 1994, I taught myself C++. In December 1995, I started using the Internet and taught myself client-server programming. I created a website in 1997 which had C and C# implementations of algorithms from the "Handbook of Applied Cryptography" by Alfred J. Menezes, et al., and some other cryptography and number theory textbooks and treatises.

Leave a comment