// Blog Entry © Wednesday, December 24, 2025, by James Pate Williams, Jr. — ID3 Decision Tree Metadata Parser

// ID3MetadataParser.cpp (c) December 2025
// by James Pate Williams, Jr.

#include "pch.h"

// Status codes used with the errorCode (and, for FILE_EOF, index)
// out-parameters of the metadata parsing routines in this file.
//
// Note: the parsers report success by assigning the literal 0 to
// errorCode, which is numerically FILE_EOF, not NO_ERROR (1).
// NO_ERROR, INVALID_LINE, INVALID_ROLE and MISSING_ROLE are not
// referenced anywhere in the visible portion of this file.
// NOTE(review): NO_ERROR is also the name of a Windows SDK macro; if
// pch.h pulls in the Windows headers this definition may clash with
// it - confirm.
#define FILE_EOF			0
#define NO_ERROR			1
#define EMPTY_FILE			2
#define INVALID_LINE		3
// "#name: ..." line problems
#define MISSING_NAME		4
#define INVALID_NAME		5
// "#description: ..." line problems
#define INVALID_DESCRIPTION 6
#define MISSING_DESCRIPTION 7
// "#type: ..." line problems
#define INVALID_TYPE		8
#define MISSING_TYPE		9
#define INVALID_RANGE		10
#define INVALID_CATEGORICAL	11
#define INVALID_DOUBLE		12
#define INVALID_FLOAT		13
#define INVALID_INTEGER		14
#define INVALID_ROLE		15
#define MISSING_ROLE		16

// The four attribute kinds a metadata "#type:" line can describe.
// Not referenced elsewhere in the visible portion of this file:
// parseType() reports the kind as a string ("categorical", "integer",
// "doubleReal", "floatReal") instead. Note that the last enumerator is
// spelled FloatReal with a capital F, unlike its siblings.
enum AttributeType {
	categorical, integer, doubleReal, FloatReal
};

// Metadata record for a categorical attribute: its name, its free-text
// description and its category labels, one char per label.
struct tagCategoricalAttribute {
	std::string name{};
	std::string description{};
	std::vector<char> category{};
};

// Value and pointer aliases for the record type.
using CategoricalAttribute = tagCategoricalAttribute;
using PCategoricalAttribute = tagCategoricalAttribute*;

// Metadata record for an integer attribute: its name, its free-text
// description and the inclusive-looking [loValue, hiValue] bounds read
// from the metadata. Both bounds default to -1 until filled in.
struct tagIntegerAttribute {
	std::string name{};
	std::string description{};
	int loValue{ -1 };
	int hiValue{ -1 };
};

// Value and pointer aliases for the record type.
using IntegerAttribute = tagIntegerAttribute;
using PIntegerAttribute = tagIntegerAttribute*;

// Metadata record for a double-precision attribute: its name, its
// free-text description and the low/high bounds read from the metadata.
// Both bounds default to -1.0 until filled in.
struct tagDoubleAttribute {
	std::string name{};
	std::string description{};
	double loValue{ -1.0 };
	double hiValue{ -1.0 };
};

// Value and pointer aliases for the record type.
using DoubleAttribute = tagDoubleAttribute;
using PDoubleAttribute = tagDoubleAttribute*;

// Metadata record for a single-precision attribute: its name, its
// free-text description and the low/high bounds read from the metadata.
// Both bounds default to -1.0f until filled in.
struct tagFloatAttribute {
	std::string name{};
	std::string description{};
	float loValue{ -1.0f };
	float hiValue{ -1.0f };
};

// Value and pointer aliases for the record type.
using FloatAttribute = tagFloatAttribute;
using PFloatAttribute = tagFloatAttribute*;

// Parses a "#name: feature <name>" or "#name: target <name>" metadata
// line.
//
// Parameters:
//   line      - NUL-terminated text of the metadata line.
//   length    - length of line as computed by the caller; copied into
//               index on success.
//   errorCode - receives 0 on success, MISSING_NAME when neither prefix
//               occurs in line, or INVALID_NAME when the name is empty or
//               contains a character that is not allowed.
//   index     - set to length on success; left untouched on failure.
//   feature   - set to true when the "feature" prefix is found, false
//               when the "target" prefix is found.
//   name      - receives the parsed name on success (previous contents
//               are replaced); left untouched on failure.
//
// A valid name begins with an ASCII letter and continues with ASCII
// letters, digits or spaces up to the end of the line.
//
// Returns true on success, false otherwise.
static bool parseName(
	char line[],
	int length,
	int& errorCode,
	int& index,
	bool& feature,
	std::string& name)
{
	static const char featurePrefix[] = "#name: feature ";
	static const char targetPrefix[] = "#name: target ";

	const char* featureHit = std::strstr(line, featurePrefix);
	const char* targetHit = std::strstr(line, targetPrefix);
	const char* cursor = nullptr;

	// Start reading right after wherever the prefix was actually found,
	// not at a fixed column.
	if (featureHit != nullptr) {
		feature = true;
		cursor = featureHit + std::strlen(featurePrefix);
	}

	else if (targetHit != nullptr) {
		feature = false;
		cursor = targetHit + std::strlen(targetPrefix);
	}

	else {
		errorCode = MISSING_NAME;
		return false;
	}

	const auto isLetter = [](char c) -> bool {
		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
	};
	const auto isDigit = [](char c) -> bool {
		return c >= '0' && c <= '9';
	};

	// Covers both an empty name and a name starting with a non-letter.
	if (!isLetter(*cursor)) {
		errorCode = INVALID_NAME;
		return false;
	}

	std::string parsed;

	// Every iteration either consumes one character or returns, so the
	// scan always terminates at the end of the line.
	for (; *cursor != '\0'; ++cursor) {
		const char c = *cursor;

		if (!isLetter(c) && !isDigit(c) && c != ' ') {
			errorCode = INVALID_NAME;
			return false;
		}

		parsed += c;
	}

	name = parsed;
	errorCode = 0;
	index = length;
	return true;
}

// Parses a "#description: <text>" metadata line.
//
// Parameters:
//   line        - NUL-terminated text of the metadata line.
//   length      - length of line as computed by the caller; at most this
//                 many characters of line are examined.
//   errorCode   - receives 0 on success, MISSING_DESCRIPTION when the
//                 prefix does not occur in line, or INVALID_DESCRIPTION
//                 when nothing follows the prefix.
//   index       - on success, set to the position just past the copied
//                 text; left untouched on failure.
//   description - the text after the prefix is APPENDED to this string,
//                 so callers wanting only the new text must clear it
//                 first.
//
// Returns true on success, false otherwise.
static bool parseDescription(
	char line[],
	int length,
	int& errorCode,
	int& index,
	std::string& description) {

	static const char prefix[] = "#description: ";

	const char* hit = std::strstr(line, prefix);

	if (hit == nullptr) {
		errorCode = MISSING_DESCRIPTION;
		return false;
	}

	// Never read beyond the terminator even if the caller's length is
	// larger than the actual string.
	const int actual = static_cast<int>(std::strlen(line));
	const int end = length < actual ? length : actual;

	// The text starts right after wherever the prefix was found.
	const int begin = static_cast<int>(hit - line) +
		static_cast<int>(std::strlen(prefix));

	if (begin >= end) {
		errorCode = INVALID_DESCRIPTION;
		return false;
	}

	description.append(line + begin, static_cast<std::size_t>(end - begin));
	index = end;
	errorCode = 0;
	return true;
}

// Parses the brace-enclosed category list of a categorical type line,
// e.g. "#type: categorical {M,F,I}".
//
// Parameters:
//   line      - NUL-terminated text of the whole metadata line.
//   length    - length of line as computed by the caller (not needed;
//               the scan stops at the terminator).
//   errorCode - receives 0 on success or INVALID_CATEGORICAL when there
//               is no '{', no closing '}', or no category between them.
//   index     - on success, set to the position just past the '}';
//               left untouched on failure.
//   category  - cleared on entry; on success holds one entry per
//               category character. Commas, spaces and tabs are treated
//               as separators; every other character between the braces
//               becomes its own single-character category. Anything
//               after the first '}' is ignored.
//
// Returns true on success, false otherwise.
static bool parseCategorical(
	char line[],
	int length,
	int& errorCode,
	int& index,
	std::vector<char>& category) {
	static_cast<void>(length);

	// Start from a clean list so categories of a previously parsed
	// attribute cannot leak into this one.
	category.clear();

	const char* cursor = std::strchr(line, '{');

	if (cursor == nullptr) {
		errorCode = INVALID_CATEGORICAL;
		return false;
	}

	for (++cursor; *cursor != '\0' && *cursor != '}'; ++cursor) {
		const char c = *cursor;

		if (c == ',' || c == ' ' || c == '\t')
			continue;

		category.push_back(c);
	}

	if (*cursor != '}' || category.empty()) {
		category.clear();
		errorCode = INVALID_CATEGORICAL;
		return false;
	}

	index = static_cast<int>(cursor - line) + 1;
	errorCode = 0;
	return true;
}

// Parses the "[lo,hi]" range of a doubleReal type line, e.g.
// "#type: doubleReal [0.075,0.815]".
//
// Parameters:
//   line      - NUL-terminated text of the whole metadata line.
//   length    - length of line as computed by the caller (not needed;
//               the delimiters are searched up to the terminator).
//   errorCode - receives 0 on success, INVALID_DOUBLE when '[', ',' or
//               ']' is missing or a bound is not a complete number, or
//               INVALID_RANGE when lo is greater than hi.
//   index     - on success, set to the position just past the ']';
//               left untouched on failure.
//   hiDouble  - receives the upper bound on success.
//   loDouble  - receives the lower bound on success.
//
// Whitespace around either bound is tolerated. Returns true on success,
// false otherwise.
static bool parseDoubleRange(
	char line[],
	int length,
	int& errorCode,
	int& index,
	double& hiDouble,
	double& loDouble)
{
	static_cast<void>(length);

	// Locate the three delimiters instead of assuming fixed columns.
	const char* open = std::strchr(line, '[');
	const char* comma = (open != nullptr) ? std::strchr(open, ',') : nullptr;
	const char* close = (comma != nullptr) ? std::strchr(comma, ']') : nullptr;

	if (close == nullptr) {
		errorCode = INVALID_DOUBLE;
		return false;
	}

	// Converts text to a double; succeeds only when the whole text
	// (apart from surrounding blanks) is consumed.
	const auto toDouble = [](const std::string& text, double& value) -> bool {
		try {
			std::size_t used = 0;
			value = std::stod(text, &used);

			while (used < text.size() &&
				(text[used] == ' ' || text[used] == '\t'))
				++used;

			return used == text.size();
		}
		catch (const std::exception&) {
			// stod throws for empty, non-numeric or out-of-range text.
			return false;
		}
	};

	const std::string loText(open + 1, comma);
	const std::string hiText(comma + 1, close);
	double lo = 0.0;
	double hi = 0.0;

	if (!toDouble(loText, lo) || !toDouble(hiText, hi)) {
		errorCode = INVALID_DOUBLE;
		return false;
	}

	if (lo > hi) {
		errorCode = INVALID_RANGE;
		return false;
	}

	loDouble = lo;
	hiDouble = hi;
	index = static_cast<int>(close - line) + 1;
	errorCode = 0;
	return true;
}

// Parses the "[lo,hi]" range of a floatReal type line, e.g.
// "#type: floatReal [0.1,0.9]".
//
// Parameters:
//   line      - NUL-terminated text of the whole metadata line.
//   length    - length of line as computed by the caller (not needed;
//               the delimiters are searched up to the terminator).
//   errorCode - receives 0 on success, INVALID_FLOAT when '[', ',' or
//               ']' is missing or a bound is not a complete number, or
//               INVALID_RANGE when lo is greater than hi.
//   index     - its incoming value is ignored; on success it is set to
//               the position just past the ']', on failure it is left
//               untouched.
//   hiFloat   - receives the upper bound on success.
//   loFloat   - receives the lower bound on success.
//
// Whitespace around either bound is tolerated. Returns true on success,
// false otherwise.
static bool parseFloatRange(
	char line[],
	int length,
	int& errorCode,
	int& index,
	float& hiFloat,
	float& loFloat)
{
	static_cast<void>(length);

	// Locate the three delimiters rather than trusting the incoming
	// index to point at the first digit.
	const char* open = std::strchr(line, '[');
	const char* comma = (open != nullptr) ? std::strchr(open, ',') : nullptr;
	const char* close = (comma != nullptr) ? std::strchr(comma, ']') : nullptr;

	if (close == nullptr) {
		errorCode = INVALID_FLOAT;
		return false;
	}

	// Converts text to a float; succeeds only when the whole text
	// (apart from surrounding blanks) is consumed.
	const auto toFloat = [](const std::string& text, float& value) -> bool {
		try {
			std::size_t used = 0;
			value = std::stof(text, &used);

			while (used < text.size() &&
				(text[used] == ' ' || text[used] == '\t'))
				++used;

			return used == text.size();
		}
		catch (const std::exception&) {
			// stof throws for empty, non-numeric or out-of-range text.
			return false;
		}
	};

	const std::string loText(open + 1, comma);
	const std::string hiText(comma + 1, close);
	float lo = 0.0f;
	float hi = 0.0f;

	if (!toFloat(loText, lo) || !toFloat(hiText, hi)) {
		errorCode = INVALID_FLOAT;
		return false;
	}

	if (lo > hi) {
		errorCode = INVALID_RANGE;
		return false;
	}

	loFloat = lo;
	hiFloat = hi;
	index = static_cast<int>(close - line) + 1;
	errorCode = 0;
	return true;
}

// Parses the "lo,hi]" tail of an integer type line.
//
// Parameters:
//   line      - NUL-terminated text that starts at the first character
//               of the low bound, i.e. just past the '[' (for
//               "#type: integer [1,29]" this is "1,29]").
//   length    - length supplied by the caller (not needed; the
//               delimiters are searched up to the terminator).
//   errorCode - receives 0 on success, INVALID_INTEGER when ',' or ']'
//               is missing or a bound is not a complete integer, or
//               INVALID_RANGE when lo is greater than hi.
//   index     - on success, advanced by the number of characters
//               consumed from line (through the ']'); left untouched on
//               failure.
//   hiInteger - receives the upper bound on success.
//   loInteger - receives the lower bound on success.
//
// Bounds may have any number of digits and an optional sign; whitespace
// around either bound is tolerated. Returns true on success, false
// otherwise.
static bool parseIntegerRange(
	char line[],
	int length,
	int& errorCode,
	int& index,
	int& hiInteger,
	int& loInteger) {
	static_cast<void>(length);

	const char* comma = std::strchr(line, ',');
	const char* close = (comma != nullptr) ? std::strchr(comma, ']') : nullptr;

	if (close == nullptr) {
		errorCode = INVALID_INTEGER;
		return false;
	}

	// Converts text to an int; succeeds only when the whole text (apart
	// from surrounding blanks) is consumed, so "1.5" or "12ab" fail.
	const auto toInt = [](const std::string& text, int& value) -> bool {
		try {
			std::size_t used = 0;
			value = std::stoi(text, &used);

			while (used < text.size() &&
				(text[used] == ' ' || text[used] == '\t'))
				++used;

			return used == text.size();
		}
		catch (const std::exception&) {
			// stoi throws for empty, non-numeric or out-of-range text.
			return false;
		}
	};

	const std::string loText(static_cast<const char*>(line), comma);
	const std::string hiText(comma + 1, close);
	int lo = 0;
	int hi = 0;

	if (!toInt(loText, lo) || !toInt(hiText, hi)) {
		errorCode = INVALID_INTEGER;
		return false;
	}

	if (lo > hi) {
		errorCode = INVALID_RANGE;
		return false;
	}

	loInteger = lo;
	hiInteger = hi;
	index += static_cast<int>(close - line) + 1;
	errorCode = 0;
	return true;
}

// Parses a "#type: ..." metadata line and dispatches to the parser for
// the kind it names.
//
// Recognised forms (the keyword is searched for after the "#type: "
// prefix):
//   categorical {...}   -> parseCategorical, fills alphabet
//   integer [lo,hi]     -> parseIntegerRange, fills loInteger/hiInteger
//   doubleReal [lo,hi]  -> parseDoubleRange,  fills loDouble/hiDouble
//   floatReal [lo,hi]   -> parseFloatRange,   fills loFloat/hiFloat
//
// Parameters:
//   line      - NUL-terminated text of the metadata line.
//   length    - length of line as computed by the caller.
//   errorCode - receives 0 on success, MISSING_TYPE when the prefix is
//               absent, INVALID_TYPE when nothing or no known keyword
//               follows it, INVALID_CATEGORICAL when the category list
//               is rejected, or otherwise the code set by the range
//               parser that rejected the line.
//   index     - set to the position just past the prefix before
//               dispatching; the called parser may update it further.
//   type      - on success, receives "categorical", "integer",
//               "doubleReal" or "floatReal".
//
// Returns true on success, false otherwise.
static bool parseType(
	char line[],
	int length,
	double& hiDouble,
	double& loDouble,
	float& hiFloat,
	float& loFloat,
	int& errorCode,
	int& index,
	int& hiInteger,
	int& loInteger,
	std::string& type,
	std::vector<char>& alphabet) {

	static const char typePrefix[] = "#type: ";
	static const char integerKeyword[] = "integer [";

	char* prefixHit = std::strstr(line, typePrefix);

	if (prefixHit == nullptr) {
		errorCode = MISSING_TYPE;
		return false;
	}

	// The specification starts right after wherever the prefix was
	// found, not at a fixed column.
	char* spec = prefixHit + std::strlen(typePrefix);
	index = static_cast<int>(spec - line);

	if (index >= length || *spec == '\0') {
		errorCode = INVALID_TYPE;
		return false;
	}

	if (std::strstr(spec, "categorical {") != nullptr) {
		if (!parseCategorical(line, length, errorCode, index, alphabet)) {
			errorCode = INVALID_CATEGORICAL;
			return false;
		}

		errorCode = 0;
		type = "categorical";
		return true;
	}

	char* integerHit = std::strstr(spec, integerKeyword);

	if (integerHit != nullptr) {
		// parseIntegerRange expects a pointer to the first character of
		// the low bound, i.e. just past the '['.
		if (!parseIntegerRange(
			integerHit + std::strlen(integerKeyword),
			length,
			errorCode,
			index,
			hiInteger,
			loInteger))
			return false;

		type = "integer";
		return true;
	}

	if (std::strstr(spec, "doubleReal [") != nullptr) {
		if (!parseDoubleRange(
			line,
			length,
			errorCode,
			index,
			hiDouble,
			loDouble))
			return false;

		type = "doubleReal";
		return true;
	}

	if (std::strstr(spec, "floatReal [") != nullptr) {
		// Return directly on failure so the code set by parseFloatRange
		// is not overwritten by the generic INVALID_TYPE below.
		if (!parseFloatRange(
			line,
			length,
			errorCode,
			index,
			hiFloat,
			loFloat))
			return false;

		type = "floatReal";
		return true;
	}

	errorCode = INVALID_TYPE;
	return false;
}

// Reads the next line of the metadata file into line.
//
// Parameters:
//   file1     - metadata stream to read from.
//   line      - caller-supplied buffer of at least 256 chars; receives
//               the NUL-terminated line without its newline.
//   errorCode - set to EMPTY_FILE when an empty line is read while index
//               is -1, or to 0 when an empty read hits end of file;
//               otherwise left untouched.
//   index     - read as a flag (-1 selects the EMPTY_FILE report above)
//               and set to FILE_EOF when an empty read hits end of file.
//
// Returns true when a non-empty line other than "#endheader" was read.
// Returns false for an empty line, for a line containing "#endheader",
// and when nothing could be read.
//
// A final line that lacks a trailing newline is still returned: reading
// it sets eofbit on the stream, but the line itself is valid data.
// Lines of 255 characters or more are cut at the buffer size by
// getline(), which also puts the stream into a failed state.
static bool readMetaDataLine(
	std::ifstream& file1,
	char line[],
	int& errorCode,
	int& index) {
	// Make sure stale text from a previous call can never be mistaken
	// for fresh input if the read extracts nothing.
	line[0] = '\0';
	file1.getline(line, 256);

	if (line[0] == '\0') {
		if (index == -1)
			errorCode = EMPTY_FILE;

		else if (file1.eof()) {
			errorCode = 0;
			index = FILE_EOF;
		}

		return false;
	}

	return std::strstr(line, "#endheader") == nullptr;
}

// Running extremes of the dataset columns, reset and filled by
// readDatasetFile() and printed by main().
// dbl_min[k] / dbl_max[k] track comma-separated field k (0-based) of
// each dataset row; main() prints slots 1 through 7 only.
// int_min / int_max track the ninth field of each row.
double dbl_max[8] = { 0 };
double dbl_min[8] = { 0 };
int int_max = 0;
int int_min = 0;

// Scans a comma-separated dataset file and records per-column extremes
// in the file-level dbl_min/dbl_max and int_min/int_max variables.
//
// For every row, fields 0..7 (0-based) that start like a number are
// converted with std::stod and folded into dbl_min[k]/dbl_max[k]; field
// 8 is converted with std::stoi and folded into int_min/int_max.
// Fields that are empty or do not start with a digit, sign or '.'
// (for example a single category letter) are skipped, as are fields
// whose conversion fails. Fields beyond the ninth are ignored.
//
// Before scanning, minima are seeded with the largest and maxima with
// the most negative representable value, so they keep those sentinels
// when no numeric field is seen for a column.
//
// file2 is closed before returning.
static void readDatasetFile(
	std::ifstream& file2) {
	for (int i = 0; i < 8; i++) {
		dbl_min[i] = DBL_MAX;
		// -DBL_MAX, not DBL_MIN: DBL_MIN is the smallest POSITIVE
		// value and would hide zero or negative maxima.
		dbl_max[i] = -DBL_MAX;
	}

	int_min = INT_MAX;
	int_max = INT_MIN;

	std::string row;

	// getline on std::string has no line-length limit and stops cleanly
	// at end of file.
	while (std::getline(file2, row)) {
		std::size_t begin = 0;

		for (int column = 0; column < 9 && begin <= row.size(); ++column) {
			std::size_t end = row.find(',', begin);

			if (end == std::string::npos)
				end = row.size();

			const std::string field = row.substr(begin, end - begin);
			begin = end + 1;

			// Cheap pre-check so non-numeric columns do not cost an
			// exception per row.
			const std::size_t first = field.find_first_not_of(" \t");

			if (first == std::string::npos)
				continue;

			const char lead = field[first];
			const bool numericStart =
				(lead >= '0' && lead <= '9') ||
				lead == '+' || lead == '-' || lead == '.';

			if (!numericStart)
				continue;

			try {
				if (column < 8) {
					const double x = std::stod(field);

					if (x > dbl_max[column])
						dbl_max[column] = x;
					if (x < dbl_min[column])
						dbl_min[column] = x;
				}

				else {
					const int x = std::stoi(field);

					if (x > int_max)
						int_max = x;
					if (x < int_min)
						int_min = x;
				}
			}
			catch (const std::exception&) {
				// Malformed or out-of-range number: skip this field
				// rather than abort the whole scan.
			}
		}
	}

	file2.close();
}

int main()
{
	bool feature = false;
	char filename1[256] = "C:\\Users\\James\\OneDrive\\Desktop\\ID3MetadataParser\\x64\\Debug\\ID3MetadataParserDataFile.txt";
	char filename2[256] = "C:\\Users\\James\\OneDrive\\Desktop\\ID3MetadataParser\\x64\\Debug\\abalone.data.txt";
	char line[256] = "";
	int errorCode = -1, index = -1, role = -1;
	std::ifstream file1(filename1);
	std::ifstream file2(filename2);

	// file1 format
	std::string name, description, type;
	std::vector<char> category;
	
	// file2 format
	std::string cat, length, diameter, height, whole;
	std::string shucked, viscera, shell, rings;
	
	std::vector<CategoricalAttribute> categoricalAttributes;
	std::vector<IntegerAttribute> integerAttributes;
	std::vector<DoubleAttribute> doubleAttributes;
	std::vector<FloatAttribute> floatAttributes;

	std::vector<std::string> names;
	std::vector<std::string> descriptions;
	std::vector<std::string> types;

	while (!file1.eof()) {
		index = -1;

		bool result = readMetaDataLine(
			file1,
			line,
			errorCode,
			index);

		if (!result)
			break;

		index = 0;
		int length = static_cast<int>(strlen(line));

		if (length == 0)
			break;
		
		name = "";

		bool pn = parseName(
			line,
			length,
			errorCode,
			index,
			feature,
			name);

		if (pn) {
			bool result = readMetaDataLine(
				file1,
				line,
				errorCode,
				index);

			if (!result)
				break;
		
			length = static_cast<int>(strlen(line));
			index = 0;
			description = "";

			bool pd = parseDescription(
				line,
				length,
				errorCode,
				index,
				description);

			if (pd) {
				bool result = readMetaDataLine(
					file1,
					line,
					errorCode,
					index);

				if (!result)
					break;

				length = static_cast<int>(strlen(line));
				index = 0;
				type = "";

				double hiDouble = DBL_MIN;
				double loDouble = DBL_MAX;
				float hiFloat = FLT_MIN;
				float loFloat = FLT_MAX;
				int hiInteger = INT_MIN;
				int loInteger = INT_MAX;

				bool pt = parseType(
					line,
					length,
					hiDouble,
					loDouble,
					hiFloat,
					loFloat,
					errorCode,
					index,
					hiInteger,
					loInteger,
					type,
					category);
					length = static_cast<int>(strlen(line));

				if (pt) {
					if (type == "categorical") {
						CategoricalAttribute ca;
						ca.category = category;
						ca.description = description;
						ca.name = name;
						categoricalAttributes.push_back(ca);
					}

					else if (type == "integer") {
						IntegerAttribute ia;
						ia.loValue = loInteger;
						ia.hiValue = hiInteger;
						ia.description = description;
						ia.name = name;
						integerAttributes.push_back(ia);
					}

					else if (type == "doubleReal") {
						DoubleAttribute da;
						da.loValue = loDouble;
						da.hiValue = hiDouble;
						da.description = description;
						da.name = name;
						doubleAttributes.push_back(da);
					}

					else if (type == "floatReal") {
						FloatAttribute fa;
						fa.loValue = loFloat;
						fa.hiValue = hiFloat;
						fa.description = description;
						fa.name = name;
						floatAttributes.push_back(fa);
					}

					else {
						errorCode = INVALID_TYPE;
						break;
					}
				}

				else {
					errorCode = MISSING_TYPE;
					break;
				}
			}

			else {
				errorCode = INVALID_DESCRIPTION;
				break;
			}
		}

		else {
			errorCode = INVALID_NAME;
			return false;
		}
	}

	readDatasetFile(file2);

	for (int i = 1; i <= 7; i++) {
		std::cout << i << '\t' << dbl_min[i];
		std::cout << '\t' << dbl_max[i];
		std::cout << std::endl;
	}

	std::cout << "8\t" << int_min << '\t' << int_max;
	std::cout << std::endl;
	std::cout << std::endl;

	for (int i = 0; i < static_cast<int>(categoricalAttributes.size()); i++) {
		std::cout << categoricalAttributes[i].name << ' ';
		std::cout << categoricalAttributes[i].description << ' ';
		std::cout << std::endl;
	}

	for (int i = 0; i < static_cast<int>(doubleAttributes.size()); i++) {
		std::cout << doubleAttributes[i].name << ' ';
		std::cout << doubleAttributes[i].description << ' ';
		std::cout << doubleAttributes[i].loValue << ' ';
		std::cout << doubleAttributes[i].hiValue;
		std::cout << std::endl;
	}

	for (int i = 0; i < static_cast<int>(floatAttributes.size()); i++) {
		std::cout << floatAttributes[i].name << ' ';
		std::cout << floatAttributes[i].description << ' ';
		std::cout << floatAttributes[i].loValue << ' ';
		std::cout << floatAttributes[i].hiValue;
		std::cout << std::endl;
	}

	for (int i = 0; i < static_cast<int>(integerAttributes.size()); i++) {
		std::cout << integerAttributes[i].name << ' ';
		std::cout << integerAttributes[i].description << ' ';
		std::cout << integerAttributes[i].loValue << ' ';
		std::cout << integerAttributes[i].hiValue;
		std::cout << std::endl;
	}

	file1.close();
	return 0;
}