Blog Entry © Wednesday, December 24, 2025, by James Pate Williams, Jr. ID3 Decision Tree Metadata Parser

// ID3MetadataParser.cpp (c) December 2025
// by James Pate Williams, Jr.

#include "pch.h"

// Status codes used by the metadata parser.
//
// NOTE(review): every parse function in this file signals success by storing
// the literal 0 in its errorCode out-parameter. That value is numerically
// equal to FILE_EOF, not NO_ERROR (which is 1), so callers should rely on the
// functions' bool return values rather than comparing errorCode to NO_ERROR.
//
// FILE_EOF is stored by readMetaDataLine in its `index` out-parameter (not in
// errorCode) when the stream reports end of file.
//
// NO_ERROR, INVALID_LINE, INVALID_ROLE and MISSING_ROLE are not referenced by
// any code in the portion of the file visible here.
#define FILE_EOF			0
#define NO_ERROR			1
// Set by readMetaDataLine when a read yields no text while index == -1.
#define EMPTY_FILE			2
#define INVALID_LINE		3
// "#name: feature " / "#name: target " line problems (parseName).
#define MISSING_NAME		4
#define INVALID_NAME		5
// "#description: " line problems (parseDescription).
#define INVALID_DESCRIPTION 6
#define MISSING_DESCRIPTION 7
// "#type: " line problems (parseType and the range/category parsers).
#define INVALID_TYPE		8
#define MISSING_TYPE		9
#define INVALID_RANGE		10
#define INVALID_CATEGORICAL	11
#define INVALID_DOUBLE		12
#define INVALID_FLOAT		13
#define INVALID_INTEGER		14
#define INVALID_ROLE		15
#define MISSING_ROLE		16

// Attribute kinds; the enumerator names mirror the "#type:" keywords that
// parseType recognises. The enum itself is not referenced by the code
// visible in this file.
enum AttributeType {
	categorical = 0,
	integer = 1,
	doubleReal = 2,
	FloatReal = 3
};

// A categorical attribute read from the metadata header.
struct tagCategoricalAttribute {
	std::string name;            // attribute name; empty until assigned
	std::string description;     // free-text description; empty until assigned
	std::vector<char> category;  // category characters; empty until assigned
};

using CategoricalAttribute = tagCategoricalAttribute;
using PCategoricalAttribute = tagCategoricalAttribute*;

// An integer-valued attribute read from the metadata header.
struct tagIntegerAttribute {
	std::string name;         // attribute name; empty until assigned
	std::string description;  // free-text description; empty until assigned
	int loValue = -1;         // lower bound of the range; -1 until assigned
	int hiValue = -1;         // upper bound of the range; -1 until assigned
};

using IntegerAttribute = tagIntegerAttribute;
using PIntegerAttribute = tagIntegerAttribute*;

// A double-precision attribute read from the metadata header.
struct tagDoubleAttribute {
	std::string name;         // attribute name; empty until assigned
	std::string description;  // free-text description; empty until assigned
	double loValue = -1.0;    // lower bound of the range; -1.0 until assigned
	double hiValue = -1.0;    // upper bound of the range; -1.0 until assigned
};

using DoubleAttribute = tagDoubleAttribute;
using PDoubleAttribute = tagDoubleAttribute*;

// A single-precision attribute read from the metadata header.
struct tagFloatAttribute {
	std::string name;         // attribute name; empty until assigned
	std::string description;  // free-text description; empty until assigned
	float loValue = -1.0f;    // lower bound of the range; -1.0f until assigned
	float hiValue = -1.0f;    // upper bound of the range; -1.0f until assigned
};

using FloatAttribute = tagFloatAttribute;
using PFloatAttribute = tagFloatAttribute*;

// Parses a "#name: feature <name>" or "#name: target <name>" metadata line.
//
// A valid name starts with a letter and continues with letters, digits or
// spaces up to the end of the line.
//
// Parameters:
//   line      - NUL-terminated text of the metadata line.
//   length    - line length as computed by the caller; copied to `index`
//               on success.
//   errorCode - out: 0 on success, MISSING_NAME when neither tag is present,
//               INVALID_NAME when the name is absent or malformed.
//   index     - out: `length` on success; on an INVALID_NAME failure, the
//               position of the offending character.
//   feature   - out: true for the "feature" tag, false for the "target" tag.
//               Left untouched when no tag is found.
//   name      - out: the parsed name. Only modified on success.
//
// Returns true when a well-formed name was read.
static bool parseName(
	char line[],
	int length,
	int& errorCode,
	int& index,
	bool& feature,
	std::string& name)
{
	const char featureTag[] = "#name: feature ";
	const char targetTag[] = "#name: target ";

	// The feature tag takes precedence, as it did when both were probed.
	const char* found = std::strstr(line, featureTag);
	bool isFeature = true;
	int tagLength = static_cast<int>(std::strlen(featureTag));

	if (found == nullptr) {
		found = std::strstr(line, targetTag);
		isFeature = false;
		tagLength = static_cast<int>(std::strlen(targetTag));
	}

	if (found == nullptr) {
		errorCode = MISSING_NAME;
		return false;
	}

	feature = isFeature;

	// Start right after the tag wherever it was found, rather than assuming
	// the tag sits at column 0.
	index = static_cast<int>(found - line) + tagLength;

	const int lineLength = static_cast<int>(std::strlen(line));

	const auto isLetter = [](char c) {
		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
	};
	const auto isDigit = [](char c) {
		return c >= '0' && c <= '9';
	};

	// The name must exist and must begin with a letter.
	if (index >= lineLength || !isLetter(line[index])) {
		errorCode = INVALID_NAME;
		return false;
	}

	std::string parsed;

	// `index` advances on every iteration, so the scan always terminates;
	// any character outside the allowed set rejects the whole name.
	for (; index < lineLength; ++index) {
		const char c = line[index];

		if (!isLetter(c) && !isDigit(c) && c != ' ') {
			errorCode = INVALID_NAME;
			return false;
		}

		parsed += c;
	}

	name = parsed;
	errorCode = 0;
	index = length;
	return true;
}

// Reads the free text of a "#description: <text>" metadata line.
//
// Parameters:
//   line        - NUL-terminated text of the metadata line.
//   length      - number of characters of `line` to consider.
//   errorCode   - out: 0 on success, MISSING_DESCRIPTION when the tag is not
//                 found, INVALID_DESCRIPTION when `length` equals the tag
//                 length (nothing follows the tag).
//   index       - out: on success, one past the last character copied.
//   description - in/out: the characters after the tag are appended to it.
//
// Returns true on success.
static bool parseDescription(
	char line[],
	int length,
	int& errorCode,
	int& index,
	std::string& description) {

	const char descriptionTag[] = "#description: ";
	const int tagLength = static_cast<int>(strlen(descriptionTag));

	if (std::strstr(line, descriptionTag) == nullptr) {
		errorCode = MISSING_DESCRIPTION;
		return false;
	}

	if (tagLength == length) {
		errorCode = INVALID_DESCRIPTION;
		return false;
	}

	// Copy everything from just past the tag up to `length`.
	for (index = tagLength; index < length; ++index)
		description.push_back(line[index]);

	errorCode = 0;
	return true;
}

// Parses the brace-enclosed category list of a categorical "#type:" line,
// e.g. "#type: categorical {F, I, M}".
//
// Each category is a single character; entries are separated by commas and
// may be surrounded by blanks. The closing '}' must be the last non-blank
// character of the line.
//
// Parameters:
//   line      - NUL-terminated text of the whole metadata line.
//   length    - number of characters of `line` to consider; the scan also
//               stops at the NUL terminator, whichever comes first.
//   errorCode - out: 0 on success; INVALID_CATEGORICAL when the list is
//               missing, empty, unterminated or has an empty or
//               multi-character entry; INVALID_TYPE when non-blank text
//               follows the closing '}'.
//   index     - out: on success, the position just past the closing '}'.
//               Unchanged on failure.
//   category  - out: replaced by the parsed categories on success.
//               Unchanged on failure.
//
// Returns true on success.
static bool parseCategorical(
	char line[],
	int length,
	int& errorCode,
	int& index,
	std::vector<char>& category) {
	const int lineLength = static_cast<int>(std::strlen(line));
	const int limit = length < lineLength ? length : lineLength;

	const auto isBlank = [](char c) {
		return c == ' ' || c == '\t' || c == '\r';
	};

	// Locate the opening brace instead of relying on a fixed column.
	int pos = 0;

	while (pos < limit && line[pos] != '{')
		++pos;

	if (pos >= limit) {
		errorCode = INVALID_CATEGORICAL;
		return false;
	}

	++pos;	// step over '{'

	std::vector<char> parsed;
	bool closed = false;
	bool expectEntry = true;	// true right after '{' or ','

	while (pos < limit && !closed) {
		const char ch = line[pos++];

		if (isBlank(ch))
			continue;

		if (ch == '}') {
			// "{}" and "{F,}" both end while an entry is still expected.
			if (expectEntry) {
				errorCode = INVALID_CATEGORICAL;
				return false;
			}

			closed = true;
		}

		else if (ch == ',') {
			// Two separators in a row, or a leading separator.
			if (expectEntry) {
				errorCode = INVALID_CATEGORICAL;
				return false;
			}

			expectEntry = true;
		}

		else {
			// A second character without a separator: not a one-character
			// category.
			if (!expectEntry) {
				errorCode = INVALID_CATEGORICAL;
				return false;
			}

			parsed.push_back(ch);
			expectEntry = false;
		}
	}

	if (!closed) {
		errorCode = INVALID_CATEGORICAL;
		return false;
	}

	// Nothing but blanks may follow the closing brace.
	for (int rest = pos; rest < limit; ++rest) {
		if (!isBlank(line[rest])) {
			errorCode = INVALID_TYPE;
			return false;
		}
	}

	category.swap(parsed);
	index = pos;
	errorCode = 0;
	return true;
}

// Converts `text` to a double, tolerating surrounding blanks but nothing
// else. Returns false when `text` is empty, is not a number, or is out of
// range for double.
static bool parseDoubleBound(const std::string& text, double& value)
{
	try {
		std::size_t used = 0;
		const double parsed = std::stod(text, &used);

		// std::stod stops at the first character it cannot use; anything
		// left over other than blanks means the bound was malformed.
		for (; used < text.size(); ++used) {
			if (text[used] != ' ' && text[used] != '\t')
				return false;
		}

		value = parsed;
		return true;
	}
	catch (const std::exception&) {
		return false;
	}
}

// Parses the "[lo, hi]" range of a "#type: doubleReal [lo, hi]" line.
//
// Parameters:
//   line      - NUL-terminated text of the whole metadata line.
//   length    - not used; the NUL terminator bounds the scan.
//   errorCode - out: 0 on success; INVALID_DOUBLE when the "doubleReal ["
//               keyword, the ',' or the ']' is missing or a bound is not a
//               number; INVALID_RANGE when lo is not <= hi.
//   index     - out: on success, the position just past the closing ']'.
//               Unchanged on failure.
//   hiDouble  - out: upper bound. Only modified on success.
//   loDouble  - out: lower bound. Only modified on success.
//
// Returns true on success.
static bool parseDoubleRange(
	char line[],
	int length,
	int& errorCode,
	int& index,
	double& hiDouble,
	double& loDouble)
{
	const char rangeTag[] = "doubleReal [";

	// Find the keyword rather than indexing a fixed column, so a short or
	// shifted line cannot be read out of bounds.
	const char* open = std::strstr(line, rangeTag);

	if (open == nullptr) {
		errorCode = INVALID_DOUBLE;
		return false;
	}

	const char* loBegin = open + std::strlen(rangeTag);
	const char* comma = std::strchr(loBegin, ',');
	const char* close =
		comma != nullptr ? std::strchr(comma + 1, ']') : nullptr;

	// Both delimiters are required; a range without them is rejected
	// instead of being parsed from a truncated string.
	if (comma == nullptr || close == nullptr) {
		errorCode = INVALID_DOUBLE;
		return false;
	}

	double lo = 0.0;
	double hi = 0.0;

	if (!parseDoubleBound(std::string(loBegin, comma), lo) ||
		!parseDoubleBound(std::string(comma + 1, close), hi)) {
		errorCode = INVALID_DOUBLE;
		return false;
	}

	// Written as a negated <= so that a NaN bound is rejected as well.
	if (!(lo <= hi)) {
		errorCode = INVALID_RANGE;
		return false;
	}

	loDouble = lo;
	hiDouble = hi;
	index = static_cast<int>(close - line) + 1;
	errorCode = 0;
	return true;
}

// Converts `text` to a float, tolerating surrounding blanks but nothing
// else. Returns false when `text` is empty, is not a number, or is out of
// range for float.
static bool parseFloatBound(const std::string& text, float& value)
{
	try {
		std::size_t used = 0;
		const float parsed = std::stof(text, &used);

		// std::stof stops at the first character it cannot use; anything
		// left over other than blanks means the bound was malformed.
		for (; used < text.size(); ++used) {
			if (text[used] != ' ' && text[used] != '\t')
				return false;
		}

		value = parsed;
		return true;
	}
	catch (const std::exception&) {
		return false;
	}
}

// Parses the "[lo, hi]" range of a "#type: floatReal [lo, hi]" line.
//
// The range is located by searching for the "floatReal [" keyword, so the
// incoming value of `index` does not matter.
//
// Parameters:
//   line      - NUL-terminated text of the whole metadata line.
//   length    - not used; the NUL terminator bounds the scan.
//   errorCode - out: 0 on success; INVALID_FLOAT when the "floatReal ["
//               keyword, the ',' or the ']' is missing or a bound is not a
//               number; INVALID_RANGE when lo is not <= hi.
//   index     - out: on success, the position just past the closing ']'.
//               Unchanged on failure.
//   hiFloat   - out: upper bound. Only modified on success.
//   loFloat   - out: lower bound. Only modified on success.
//
// Returns true on success.
static bool parseFloatRange(
	char line[],
	int length,
	int& errorCode,
	int& index,
	float& hiFloat,
	float& loFloat)
{
	const char rangeTag[] = "floatReal [";

	const char* open = std::strstr(line, rangeTag);

	if (open == nullptr) {
		errorCode = INVALID_FLOAT;
		return false;
	}

	const char* loBegin = open + std::strlen(rangeTag);
	const char* comma = std::strchr(loBegin, ',');
	const char* close =
		comma != nullptr ? std::strchr(comma + 1, ']') : nullptr;

	// Both delimiters are required.
	if (comma == nullptr || close == nullptr) {
		errorCode = INVALID_FLOAT;
		return false;
	}

	float lo = 0.0f;
	float hi = 0.0f;

	if (!parseFloatBound(std::string(loBegin, comma), lo) ||
		!parseFloatBound(std::string(comma + 1, close), hi)) {
		errorCode = INVALID_FLOAT;
		return false;
	}

	// Written as a negated <= so that a NaN bound is rejected as well.
	if (!(lo <= hi)) {
		errorCode = INVALID_RANGE;
		return false;
	}

	loFloat = lo;
	hiFloat = hi;
	index = static_cast<int>(close - line) + 1;
	errorCode = 0;
	return true;
}

static bool parseIntegerRange(
	char line[],
	int length,
	int& errorCode,
	int& index,
	int& hiInteger,
	int& loInteger) {
	char ch = '\0';
	int i = 0;
	std::string integerStr;
	
	ch = line[i++];

	if (ch < '0' || ch > '9') {
		errorCode = INVALID_INTEGER;
		return false;
	}

	while (ch != ',' &&	index < length) {
		integerStr.push_back(ch);
		ch = line[i++];
		index++;
	}

	if (integerStr.size() == 0) {
		errorCode = INVALID_INTEGER;
		return false;
	}

	else {
		try {
			loInteger = std::stoi(integerStr);
			integerStr = "";
			i = 0;
			ch = line[i++];
			ch = line[i++];
			ch = line[i++];
			index += 3;

			while (
				ch >= '0' && ch <= '9' &&
				ch != ']' && index < length) {
				integerStr.push_back(ch);
				ch = line[i++];
				index++;
			}

			if (integerStr.size() == 0) {
				errorCode = INVALID_INTEGER;
				return false;
			}

			hiInteger = std::stoi(integerStr);
			errorCode = 0;
			return true;
		}
		catch (std::exception ex) {
			errorCode = INVALID_INTEGER;
			return false;
		}
	}

	errorCode = INVALID_RANGE;
	return false;
}

// Parses a "#type: ..." metadata line and dispatches to the parser for the
// declared kind.
//
// Recognised forms (checked in this order):
//   #type: categorical {a, b, ...}   -> type = "categorical", fills alphabet
//   #type: integer [lo, hi]          -> type = "integer",     fills lo/hiInteger
//   #type: doubleReal [lo, hi]       -> type = "doubleReal",  fills lo/hiDouble
//   #type: floatReal [lo, hi]        -> type = "floatReal",   fills lo/hiFloat
//
// Parameters:
//   line      - NUL-terminated text of the metadata line.
//   length    - length of `line` as computed by the caller.
//   errorCode - out: MISSING_TYPE when the "#type: " tag is absent;
//               INVALID_TYPE when nothing follows the tag or the kind is not
//               recognised; INVALID_CATEGORICAL when the category list is
//               rejected; otherwise whatever the range parser stored
//               (0 on success).
//   index     - out: set to the position just past the "#type: " tag, then
//               handed to the kind-specific parser, which may move it.
//   type      - out: the recognised kind name. Only modified on success.
//   Remaining parameters are forwarded to the kind-specific parsers.
//
// Returns true when the line was parsed successfully.
static bool parseType(
	char line[],
	int length,
	double& hiDouble,
	double& loDouble,
	float& hiFloat,
	float& loFloat,
	int& errorCode,
	int& index,
	int& hiInteger,
	int& loInteger,
	std::string& type,
	std::vector<char>& alphabet) {

	const char typeTag[] = "#type: ";
	const char integerTag[] = "integer [";
	const char floatTag[] = "floatReal [";

	char* tagStart = std::strstr(line, typeTag);

	if (tagStart == nullptr) {
		errorCode = MISSING_TYPE;
		return false;
	}

	// Position just past the tag, wherever the tag was found.
	index = static_cast<int>(tagStart - line) +
		static_cast<int>(strlen(typeTag));

	if (index >= length) {
		errorCode = INVALID_TYPE;
		return false;
	}

	char* cptr = line + index;

	if (std::strstr(cptr, "categorical {") != nullptr) {
		if (!parseCategorical(line, length, errorCode,
			index, alphabet)) {
			errorCode = INVALID_CATEGORICAL;
			return false;
		}

		errorCode = 0;
		type = "categorical";
		return true;
	}

	if (char* range = std::strstr(cptr, integerTag)) {
		// parseIntegerRange expects a pointer to the text after the '['.
		if (!parseIntegerRange(
			range + strlen(integerTag),
			length,
			errorCode,
			index,
			hiInteger,
			loInteger))
			return false;

		type = "integer";
		return true;
	}

	if (std::strstr(cptr, "doubleReal [") != nullptr) {
		if (!parseDoubleRange(
			line,
			length,
			errorCode,
			index,
			hiDouble,
			loDouble))
			return false;

		type = "doubleReal";
		return true;
	}

	if (char* range = std::strstr(cptr, floatTag)) {
		// Hand parseFloatRange the position of the first character after
		// the '[' so it does not start on the keyword itself.
		index = static_cast<int>(range - line) +
			static_cast<int>(strlen(floatTag));

		// Return the parser's own error code on failure instead of falling
		// through to the generic INVALID_TYPE below.
		if (!parseFloatRange(
			line,
			length,
			errorCode,
			index,
			hiFloat,
			loFloat))
			return false;

		type = "floatReal";
		return true;
	}

	errorCode = INVALID_TYPE;
	return false;
}

// Reads the next line of the metadata header into `line`.
//
// Parameters:
//   file1     - stream to read from.
//   line      - out: receives the line as a NUL-terminated string. The
//               buffer must hold at least 256 characters.
//   errorCode - out: EMPTY_FILE when nothing was read and `index` is -1;
//               0 when nothing was read because the stream is at end of
//               file; otherwise left unchanged.
//   index     - in/out: the caller passes -1 to mark a read for which an
//               empty result should be reported as EMPTY_FILE; set to
//               FILE_EOF when end of file is reached without any text.
//
// Returns true when a non-empty line other than "#endheader" was read.
// Returns false for an empty line, a line containing "#endheader", or end
// of file. A final line that is not newline-terminated is still returned.
static bool readMetaDataLine(
	std::ifstream& file1,
	char line[],
	int& errorCode,
	int& index) {
	// Guarantee a defined (empty) string even if the read fails outright.
	line[0] = '\0';
	file1.getline(line, 256);

	const bool haveText = line[0] != '\0';

	if (!haveText) {
		if (index == -1) {
			errorCode = EMPTY_FILE;
		}

		else if (file1.eof()) {
			errorCode = 0;
			index = FILE_EOF;
		}

		return false;
	}

	// The stream state is deliberately not consulted once text has been
	// read: eofbit is also set when the last line has no trailing newline,
	// and that line must not be discarded.
	return std::strstr(line, "#endheader") == nullptr;
}

// Running extremes filled in by readDatasetFile.
// dbl_min[k] / dbl_max[k] describe column k + 1 of the dataset (columns 1-8);
// int_min / int_max describe column 9.
double dbl_max[8] = { 0 };
double dbl_min[8] = { 0 };
int int_max = 0;
int int_min = 0;

// Returns `field` without leading and trailing blanks or carriage returns.
static std::string datasetTrimField(const std::string& field)
{
	std::size_t first = 0;
	std::size_t last = field.size();

	while (first < last &&
		(field[first] == ' ' || field[first] == '\t'))
		++first;

	while (last > first &&
		(field[last - 1] == ' ' || field[last - 1] == '\t' ||
			field[last - 1] == '\r'))
		--last;

	return field.substr(first, last - first);
}

// True when `field` starts like a number. This is a cheap filter so that
// label columns (such as a one-letter code) are skipped without throwing.
static bool datasetLooksNumeric(const std::string& field)
{
	if (field.empty())
		return false;

	const char c = field[0];
	return (c >= '0' && c <= '9') || c == '-' || c == '+' || c == '.';
}

// Converts a whole field to double; false if it is not entirely a number.
static bool datasetFieldAsDouble(const std::string& field, double& value)
{
	if (!datasetLooksNumeric(field))
		return false;

	try {
		std::size_t used = 0;
		const double parsed = std::stod(field, &used);

		if (used != field.size())
			return false;

		value = parsed;
		return true;
	}
	catch (const std::exception&) {
		return false;
	}
}

// Converts a whole field to int; false if it is not entirely an integer.
static bool datasetFieldAsInt(const std::string& field, int& value)
{
	if (!datasetLooksNumeric(field))
		return false;

	try {
		std::size_t used = 0;
		const int parsed = std::stoi(field, &used);

		if (used != field.size())
			return false;

		value = parsed;
		return true;
	}
	catch (const std::exception&) {
		return false;
	}
}

// Scans a comma-separated dataset and records per-column minima and maxima
// in the globals above.
//
// Columns 1-8 are treated as doubles and column 9 as an integer; fields that
// are empty or not entirely numeric are skipped, and columns beyond the
// ninth are ignored. When a column contains no numeric value, its minimum
// stays at DBL_MAX / INT_MAX and its maximum at -DBL_MAX / INT_MIN.
//
// The stream is closed before returning. An unopened or failed stream simply
// yields no rows.
static void readDatasetFile(
	std::ifstream& file2) {
	for (int i = 0; i < 8; i++) {
		dbl_min[i] = DBL_MAX;
		// DBL_MIN is the smallest *positive* double; the correct seed for a
		// running maximum is the most negative value, -DBL_MAX.
		dbl_max[i] = -DBL_MAX;
	}

	int_min = INT_MAX;
	int_max = INT_MIN;

	std::string row;

	// Looping on the read itself (not on eof()) ends on any stream failure,
	// and std::string rows have no fixed length limit.
	while (std::getline(file2, row)) {
		std::size_t start = 0;

		for (int column = 1;
			column <= 9 && start <= row.size();
			++column) {
			std::size_t stop = row.find(',', start);

			if (stop == std::string::npos)
				stop = row.size();

			const std::string field =
				datasetTrimField(row.substr(start, stop - start));
			start = stop + 1;

			if (column <= 8) {
				double x = 0.0;

				if (datasetFieldAsDouble(field, x)) {
					if (x > dbl_max[column - 1])
						dbl_max[column - 1] = x;
					if (x < dbl_min[column - 1])
						dbl_min[column - 1] = x;
				}
			}

			else {
				int x = 0;

				if (datasetFieldAsInt(field, x)) {
					if (x > int_max)
						int_max = x;
					if (x < int_min)
						int_min = x;
				}
			}
		}
	}

	file2.close();
}

int main()
{
	bool feature = false;
	char filename1[256] = "C:\\Users\\James\\OneDrive\\Desktop\\ID3MetadataParser\\x64\\Debug\\ID3MetadataParserDataFile.txt";
	char filename2[256] = "C:\\Users\\James\\OneDrive\\Desktop\\ID3MetadataParser\\x64\\Debug\\abalone.data.txt";
	char line[256] = "";
	int errorCode = -1, index = -1, role = -1;
	std::ifstream file1(filename1);
	std::ifstream file2(filename2);

	// file1 format
	std::string name, description, type;
	std::vector<char> category;
	
	// file2 format
	std::string cat, length, diameter, height, whole;
	std::string shucked, viscera, shell, rings;
	
	std::vector<CategoricalAttribute> categoricalAttributes;
	std::vector<IntegerAttribute> integerAttributes;
	std::vector<DoubleAttribute> doubleAttributes;
	std::vector<FloatAttribute> floatAttributes;

	std::vector<std::string> names;
	std::vector<std::string> descriptions;
	std::vector<std::string> types;

	while (!file1.eof()) {
		index = -1;

		bool result = readMetaDataLine(
			file1,
			line,
			errorCode,
			index);

		if (!result)
			break;

		index = 0;
		int length = static_cast<int>(strlen(line));

		if (length == 0)
			break;
		
		name = "";

		bool pn = parseName(
			line,
			length,
			errorCode,
			index,
			feature,
			name);

		if (pn) {
			bool result = readMetaDataLine(
				file1,
				line,
				errorCode,
				index);

			if (!result)
				break;
		
			length = static_cast<int>(strlen(line));
			index = 0;
			description = "";

			bool pd = parseDescription(
				line,
				length,
				errorCode,
				index,
				description);

			if (pd) {
				bool result = readMetaDataLine(
					file1,
					line,
					errorCode,
					index);

				if (!result)
					break;

				length = static_cast<int>(strlen(line));
				index = 0;
				type = "";

				double hiDouble = DBL_MIN;
				double loDouble = DBL_MAX;
				float hiFloat = FLT_MIN;
				float loFloat = FLT_MAX;
				int hiInteger = INT_MIN;
				int loInteger = INT_MAX;

				bool pt = parseType(
					line,
					length,
					hiDouble,
					loDouble,
					hiFloat,
					loFloat,
					errorCode,
					index,
					hiInteger,
					loInteger,
					type,
					category);
					length = static_cast<int>(strlen(line));

				if (pt) {
					if (type == "categorical") {
						CategoricalAttribute ca;
						ca.category = category;
						ca.description = description;
						ca.name = name;
						categoricalAttributes.push_back(ca);
					}

					else if (type == "integer") {
						IntegerAttribute ia;
						ia.loValue = loInteger;
						ia.hiValue = hiInteger;
						ia.description = description;
						ia.name = name;
						integerAttributes.push_back(ia);
					}

					else if (type == "doubleReal") {
						DoubleAttribute da;
						da.loValue = loDouble;
						da.hiValue = hiDouble;
						da.description = description;
						da.name = name;
						doubleAttributes.push_back(da);
					}

					else if (type == "floatReal") {
						FloatAttribute fa;
						fa.loValue = loFloat;
						fa.hiValue = hiFloat;
						fa.description = description;
						fa.name = name;
						floatAttributes.push_back(fa);
					}

					else {
						errorCode = INVALID_TYPE;
						break;
					}
				}

				else {
					errorCode = MISSING_TYPE;
					break;
				}
			}

			else {
				errorCode = INVALID_DESCRIPTION;
				break;
			}
		}

		else {
			errorCode = INVALID_NAME;
			return false;
		}
	}

	readDatasetFile(file2);

	for (int i = 1; i <= 7; i++) {
		std::cout << i << '\t' << dbl_min[i];
		std::cout << '\t' << dbl_max[i];
		std::cout << std::endl;
	}

	std::cout << "8\t" << int_min << '\t' << int_max;
	std::cout << std::endl;
	std::cout << std::endl;

	for (int i = 0; i < static_cast<int>(categoricalAttributes.size()); i++) {
		std::cout << categoricalAttributes[i].name << ' ';
		std::cout << categoricalAttributes[i].description << ' ';
		std::cout << std::endl;
	}

	for (int i = 0; i < static_cast<int>(doubleAttributes.size()); i++) {
		std::cout << doubleAttributes[i].name << ' ';
		std::cout << doubleAttributes[i].description << ' ';
		std::cout << doubleAttributes[i].loValue << ' ';
		std::cout << doubleAttributes[i].hiValue;
		std::cout << std::endl;
	}

	for (int i = 0; i < static_cast<int>(floatAttributes.size()); i++) {
		std::cout << floatAttributes[i].name << ' ';
		std::cout << floatAttributes[i].description << ' ';
		std::cout << floatAttributes[i].loValue << ' ';
		std::cout << floatAttributes[i].hiValue;
		std::cout << std::endl;
	}

	for (int i = 0; i < static_cast<int>(integerAttributes.size()); i++) {
		std::cout << integerAttributes[i].name << ' ';
		std::cout << integerAttributes[i].description << ' ';
		std::cout << integerAttributes[i].loValue << ' ';
		std::cout << integerAttributes[i].hiValue;
		std::cout << std::endl;
	}

	file1.close();
	return 0;
}

Unknown's avatar

Author: jamespatewilliamsjr

My whole legal name is James Pate Williams, Jr. I was born in LaGrange, Georgia approximately 70 years ago. I barely graduated from LaGrange High School with low marks in June 1971. Later in June 1979, I graduated from LaGrange College with a Bachelor of Arts in Chemistry with a little over a 3 out of 4 Grade Point Average (GPA). In the Spring Quarter of 1978, I taught myself how to program a Texas Instruments desktop programmable calculator and in the Summer Quarter of 1978 I taught myself Dayton BASIC (Beginner's All-purpose Symbolic Instruction Code) on LaGrange College's Data General Eclipse minicomputer. I took courses in BASIC in the Fall Quarter of 1978 and FORTRAN IV (Formula Translator IV) in the Winter Quarter of 1979. Professor Kenneth Cooper, a genius poly-scientist, taught me a course in the Intel 8085 microprocessor architecture and assembly and machine language. We would hand assemble our programs and insert the resulting machine code into our crude wooden box computer which was designed and built by Professor Cooper. From 1990 to 1994 I earned a Bachelor of Science in Computer Science from LaGrange College. I had a 4 out of 4 GPA in the period 1990 to 1994. I took courses in C, COBOL, and Pascal during my BS work. After graduating from LaGrange College a second time in May 1994, I taught myself C++. In December 1995, I started using the Internet and taught myself client-server programming. I created a website in 1997 which had C and C# implementations of algorithms from the "Handbook of Applied Cryptography" by Alfred J. Menezes, et al., and some other cryptography and number theory textbooks and treatises.

Leave a comment