// Lexical.cpp
// Copyright (c) Friday, April 4, 2025, by
// James Pate Williams, Jr.
// Reference: "Modern Compiler Implementation
// in Java Second Edition" (c) 2002
// Andrew W. Appel with Jens Palsberg
// Chapter Two, Lexical Analysis
#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
// Character classes and word tables used by the scanner.
std::vector<char> AlphabeticChars = { 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e',
'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };
std::vector<char> BracketChars = { '(', ')', '{', '}', '[', ']' };
std::vector<char> ArithmeticOps = { '+', '-', '*', '/', '%' };
std::vector<char> LogicalOps = { '&', '^', '|' };
// All ten decimal digits ('3' included).
std::vector<char> DecimalDigits = { '0', '1', '2', '3', '4', '5', '6',
'7', '8', '9' };
std::vector<char> WhiteSpace = { ' ', '\t', '\n' };
std::vector<std::string> ReservedWords = {
"do", "double", "else", "for", "if", "int", "long", "while" };
// Fixed-size C-string copy of ReservedWords that main() fills in.
// TokenScan compares against ReservedWords directly, so it does not
// depend on this table having been populated.
char reservedWordsAscii[8][8] = { { } };
// Letters followed by digits; filled in by main().
std::vector<char> AlphaNumericChars;

namespace {

// True for the characters that separate tokens.
inline bool IsBlankChar(char c) {
    return c == ' ' || c == '\t' || c == '\n';
}

// True for '0'..'9'.
inline bool IsDigitChar(char c) {
    return c >= '0' && c <= '9';
}

// True for 'A'..'Z' and 'a'..'z'.
inline bool IsLetterChar(char c) {
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

// True for characters allowed after the first character of an identifier.
inline bool IsIdentifierChar(char c) {
    return IsLetterChar(c) || IsDigitChar(c) || c == '_';
}

// Sets ch to the character that ended a token ('\0' at end of line) and
// steps cptr over it when it is a single blank.
inline void FinishToken(const std::string& line, size_t& cptr, char& ch) {
    ch = cptr < line.size() ? line[cptr] : '\0';
    if (IsBlankChar(ch)) {
        ++cptr;
    }
}

} // namespace

// Scans one token of wline starting at index cptr.
//
// Leading blanks (' ', '\t', '\n') are skipped first. The function then
// recognizes exactly one of:
//   * end of line   -> endOfLine = true, endOfLineChar = '\n'
//   * identifier    -> id = true, idStr holds the text
//                      (a letter followed by letters, digits or '_')
//   * reserved word -> reservedWord = true, reservedWordStr holds the text
//                      (an identifier that equals an entry of ReservedWords)
//   * integer       -> intNumber = true, intNumberStr holds the digits
//   * real number   -> realNumber = true, realNumberStr holds the text
//                      (optional digits, '.', at least one digit)
//
// On success cptr is left after the token; if the token was followed by a
// blank that blank is consumed as well and stored in ch, otherwise ch holds
// the (unconsumed) character that ended the token, or '\0' at end of line.
//
// On failure the function returns false with cptr on, and ch equal to, the
// character that could not be scanned. A character that cannot start a token
// (for example an operator) leaves cptr unchanged. A number that is directly
// followed by a letter, '_' or a second '.' is rejected as malformed.
//
// The flags endOfLine, id, intNumber, realNumber and reservedWord are reset
// at entry, so they always describe this call only. The operator, bracket
// and white-space parameters are not used by this function.
bool TokenScan(
    std::string wline,
    bool& arithmeticOp,
    bool& bracket,
    bool& endOfLine,
    bool& id,
    bool& intNumber,
    bool& logicalOp,
    bool& realNumber,
    bool& reservedWord,
    bool& whiteSpace,
    char& arithmeticOpChar,
    char& bracketChar,
    char& ch,
    char& endOfLineChar,
    char& whiteSpaceChar,
    size_t& cptr,
    std::string& idStr,
    std::string& intNumberStr,
    std::string& realNumberStr,
    std::string& reservedWordStr) {
    const size_t length = wline.size();
    endOfLine = id = intNumber = realNumber = reservedWord = false;

    // Skip leading blanks; the bound is tested first so that nothing past
    // the end of the line is ever read.
    while (cptr < length && IsBlankChar(wline[cptr])) {
        ++cptr;
    }
    if (cptr >= length) {
        ch = '\0';
        endOfLine = true;
        endOfLineChar = '\n';
        return true;
    }

    ch = wline[cptr];

    if (IsLetterChar(ch)) {
        const size_t start = cptr;
        while (cptr < length && IsIdentifierChar(wline[cptr])) {
            ++cptr;
        }
        const std::string word = wline.substr(start, cptr - start);
        FinishToken(wline, cptr, ch);
        // Whole-word comparison: "dog" or "doubles" must not match "do"
        // or "double".
        const bool isReserved =
            std::find(ReservedWords.begin(), ReservedWords.end(), word) !=
            ReservedWords.end();
        reservedWord = isReserved;
        id = !isReserved;
        if (isReserved) {
            reservedWordStr = word;
            idStr = "";
        }
        else {
            idStr = word;
            reservedWordStr = "";
        }
        return true;
    }

    if (IsDigitChar(ch) || ch == '.') {
        const size_t start = cptr;
        while (cptr < length && IsDigitChar(wline[cptr])) {
            ++cptr;
        }
        bool hasFraction = false;
        if (cptr < length && wline[cptr] == '.') {
            ++cptr;
            const size_t fractionStart = cptr;
            while (cptr < length && IsDigitChar(wline[cptr])) {
                ++cptr;
            }
            if (cptr == fractionStart) {
                // A '.' must be followed by at least one digit.
                ch = cptr < length ? wline[cptr] : '\0';
                return false;
            }
            hasFraction = true;
        }
        const char next = cptr < length ? wline[cptr] : '\0';
        if (IsLetterChar(next) || next == '_' || next == '.') {
            // Malformed number such as "12abc" or "1.2.3".
            ch = next;
            return false;
        }
        const std::string text = wline.substr(start, cptr - start);
        FinishToken(wline, cptr, ch);
        if (hasFraction) {
            realNumberStr = text;
            intNumberStr = "";
            realNumber = true;
        }
        else {
            intNumberStr = text;
            realNumberStr = "";
            intNumber = true;
        }
        return true;
    }

    // Not the start of an identifier or number; the caller classifies ch.
    return false;
}
// Scans the next lexical element of wline starting at index cptr.
//
// All flag and single-character outputs are cleared at entry, so they always
// describe this call only. Leading blanks (' ', '\t', '\n') are skipped and
// then, in this order:
//   1. End of line: endOfLine = true, endOfLineChar = '\n', returns true.
//   2. TokenScan is asked for an identifier, reserved word, integer or real
//      number; if it reports one, its outputs are passed through unchanged.
//   3. Defensive fallback: if the text began with a digit and TokenScan
//      rejected it, a plain run of digits terminated by a blank or by the end
//      of the line is reported as an integer (intNumber, intNumberStr). One
//      trailing blank is consumed and reported through whiteSpace,
//      whiteSpaceChar and ch.
//   4. Two consecutive backslashes start a one-line comment; the rest of the
//      line is skipped and endOfLine is set. A single backslash is an error
//      (returns false with cptr moved past it).
//   5. An arithmetic operator, bracket or logical operator sets arithmeticOp,
//      bracket or logicalOp. The character is left in ch (and in
//      arithmeticOpChar / bracketChar where such an output exists) and cptr is
//      NOT advanced: the caller consumes the character.
//
// Returns true when one of the above was recognized and false otherwise; on
// false, ch holds the character at cptr ('\0' at end of line).
bool LineScan(
    std::string wline,
    bool& arithmeticOp,
    bool& bracket,
    bool& endOfLine,
    bool& id,
    bool& intNumber,
    bool& logicalOp,
    bool& realNumber,
    bool& reservedWord,
    bool& whiteSpace,
    char& arithmeticOpChar,
    char& bracketChar,
    char& ch,
    char& endOfLineChar,
    char& whiteSpaceChar,
    size_t& cptr,
    std::string& idStr,
    std::string& intNumberStr,
    std::string& realNumberStr,
    std::string& reservedWordStr)
{
    const size_t length = wline.size();
    const auto isBlank = [](char c) { return c == ' ' || c == '\t' || c == '\n'; };
    const auto isDigit = [](char c) { return c >= '0' && c <= '9'; };

    // Reset before the end-of-line test so that no flag from an earlier call
    // can leak into an end-of-line result.
    arithmeticOp = bracket = endOfLine = id = false;
    intNumber = logicalOp = realNumber = reservedWord = whiteSpace = false;
    arithmeticOpChar = bracketChar = endOfLineChar = whiteSpaceChar = '\0';

    // The bound is tested first so nothing past the end of the line is read.
    while (cptr < length && isBlank(wline[cptr])) {
        ++cptr;
    }
    if (cptr >= length) {
        ch = '\0';
        endOfLine = true;
        endOfLineChar = '\n';
        return true;
    }

    const size_t start = cptr;
    const bool scanned = TokenScan(
        wline,
        arithmeticOp,
        bracket,
        endOfLine,
        id,
        intNumber,
        logicalOp,
        realNumber,
        reservedWord,
        whiteSpace,
        arithmeticOpChar,
        bracketChar,
        ch,
        endOfLineChar,
        whiteSpaceChar,
        cptr,
        idStr,
        intNumberStr,
        realNumberStr,
        reservedWordStr);
    if (scanned &&
        (id || intNumber || realNumber || reservedWord || endOfLine)) {
        return true;
    }

    // Fallback integer scan, restarted from where the token began.
    if (isDigit(wline[start])) {
        size_t stop = start;
        while (stop < length && isDigit(wline[stop])) {
            ++stop;
        }
        if (stop == length || isBlank(wline[stop])) {
            intNumberStr = wline.substr(start, stop - start);
            realNumberStr = "";
            intNumber = true;
            cptr = stop;
            if (cptr < length) {
                ch = whiteSpaceChar = wline[cptr];
                whiteSpace = true;
                ++cptr;
            }
            else {
                ch = '\0';
            }
            return true;
        }
    }

    // Classify the character TokenScan stopped at.
    ch = cptr < length ? wline[cptr] : '\0';

    if (ch == '\\') {
        ++cptr;
        if (cptr >= length) {
            return false;
        }
        ch = wline[cptr];
        if (ch != '\\') {
            // single \ found
            return false;
        }
        // Found a one-line comment: skip until end-of-line.
        const size_t newline = wline.find('\n', cptr);
        if (newline == std::string::npos) {
            cptr = length;
            ch = '\0';
        }
        else {
            cptr = newline + 1;
            ch = '\n';
        }
        endOfLine = true;
        endOfLineChar = '\n';
        return true;
    }

    if (std::find(ArithmeticOps.begin(), ArithmeticOps.end(), ch) !=
        ArithmeticOps.end()) {
        arithmeticOp = true;
        arithmeticOpChar = ch;
        return true;
    }
    if (std::find(BracketChars.begin(), BracketChars.end(), ch) !=
        BracketChars.end()) {
        bracket = true;
        bracketChar = ch;
        return true;
    }
    if (std::find(LogicalOps.begin(), LogicalOps.end(), ch) !=
        LogicalOps.end()) {
        logicalOp = true;
        return true;
    }
    return false;
}
// Demo driver: fills the derived tables, then scans four sample lines and
// prints every token that LineScan reports.
int main()
{
    // AlphaNumericChars = all letters followed by all decimal digits.
    AlphaNumericChars.insert(AlphaNumericChars.end(),
        AlphabeticChars.begin(), AlphabeticChars.end());
    AlphaNumericChars.insert(AlphaNumericChars.end(),
        DecimalDigits.begin(), DecimalDigits.end());

    // Mirror ReservedWords into the fixed-size C-string table. Both bounds
    // are taken from the array itself so that a longer word list or a longer
    // word can never write outside of it.
    const size_t rows = sizeof reservedWordsAscii / sizeof reservedWordsAscii[0];
    const size_t cols = sizeof reservedWordsAscii[0];
    for (size_t i = 0; i < rows && i < ReservedWords.size(); i++) {
        const std::string& word = ReservedWords[i];
        const size_t count = std::min(word.size(), cols - 1);
        for (size_t j = 0; j < count; j++) {
            reservedWordsAscii[i][j] = word[j];
        }
        reservedWordsAscii[i][count] = '\0';
    }

    const std::vector<std::string> lines = {
        "abc4 def_5 amp c1\n",
        ".1234 0.4567 9876 while for\n",
        "\\\\this is a one-line comment\n",
        "a + b / c\n" };

    for (const std::string& line : lines) {
        bool arithmeticOp = false, bracket = false, endOfLine = false;
        bool id = false, intNumber = false, logicalOp = false, realNumber = false;
        bool reservedWord = false, whiteSpace = false;
        char arithmeticOpChar = '\0', bracketChar = '\0', ch = '\0';
        char endOfLineChar = '\0', logicalOpChar = '\0', whiteSpaceChar = '\0';
        size_t cptr = 0;
        std::string idStr, intNumberStr, realNumberStr, reservedWordStr;

        while (cptr < line.size()) {
            const size_t before = cptr;
            const bool recognized = LineScan(
                line,
                arithmeticOp,
                bracket,
                endOfLine,
                id,
                intNumber,
                logicalOp,
                realNumber,
                reservedWord,
                whiteSpace,
                arithmeticOpChar,
                bracketChar,
                ch,
                endOfLineChar,
                whiteSpaceChar,
                cptr,
                idStr,
                intNumberStr,
                realNumberStr,
                reservedWordStr);
            if (recognized && endOfLine) {
                break;
            }
            if (recognized) {
                if (id) {
                    ch = cptr < line.size() ? line[cptr] : '\0';
                    std::cout << "id = " << idStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    id = false;
                    idStr = "";
                }
                else if (intNumber) {
                    std::cout << "int number = " << intNumberStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    intNumber = false;
                    intNumberStr = "";
                }
                else if (realNumber) {
                    std::cout << "real number = " << realNumberStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    realNumber = false;
                    realNumberStr = "";
                }
                else if (reservedWord) {
                    std::cout << "reserved word = " << reservedWordStr << '\n';
                    std::cout << "cptr = " << cptr << '\n';
                    std::cout << "ch = " << ch << '\n';
                    reservedWord = false;
                    reservedWordStr = "";
                }
                else if (arithmeticOp) {
                    // LineScan leaves cptr on the operator; consume it here.
                    arithmeticOpChar = ch = line[cptr++];
                    std::cout << "Arithmetic operator character = ";
                    std::cout << arithmeticOpChar << '\n';
                    arithmeticOp = false;
                }
                else if (logicalOp) {
                    logicalOpChar = ch = line[cptr++];
                    std::cout << "Logical operator character = ";
                    std::cout << logicalOpChar << '\n';
                    logicalOp = false;
                }
                else if (bracket || bracketChar != '\0') {
                    // Either output signals a bracket; the character itself
                    // is read from the line and consumed here.
                    bracketChar = ch = line[cptr++];
                    std::cout << "Bracket character = ";
                    std::cout << bracketChar << '\n';
                    bracket = false;
                    bracketChar = '\0';
                }
            }
            if (cptr == before) {
                // Nothing was consumed: skip the character so the loop is
                // guaranteed to make progress on input it cannot classify.
                ch = line[cptr++];
                std::cout << "Unrecognized character = " << ch << '\n';
            }
            if (ch == '\0' || ch == '\n') {
                break;
            }
        }
    }
    std::cout.flush();
    return 0;
}