// ID3MetadataParser.cpp (c) December 2025
// by James Pate Williams, Jr.
#include "pch.h"
// Status codes used by the parsing helpers in this file, mostly through
// their `errorCode` out-parameter. The helpers store a literal 0 in
// `errorCode` on success rather than one of these names.
//
// FILE_EOF is written to `index` (not `errorCode`) by readMetaDataLine when
// the stream reports end-of-file.
#define FILE_EOF 0
// NO_ERROR is not referenced by the code visible in this file.
// NOTE(review): <winerror.h> also defines NO_ERROR (as 0L); confirm pch.h
// does not pull it in, otherwise this is a conflicting redefinition.
#define NO_ERROR 1
// Set by readMetaDataLine when the first line of a record is empty.
#define EMPTY_FILE 2
// INVALID_LINE is not referenced by the code visible in this file.
#define INVALID_LINE 3
// "#name: feature " / "#name: target " line problems (parseName).
#define MISSING_NAME 4
#define INVALID_NAME 5
// "#description: " line problems (parseDescription).
#define INVALID_DESCRIPTION 6
#define MISSING_DESCRIPTION 7
// "#type: " line problems (parseType and its range/category helpers).
#define INVALID_TYPE 8
#define MISSING_TYPE 9
#define INVALID_RANGE 10
#define INVALID_CATEGORICAL 11
#define INVALID_DOUBLE 12
#define INVALID_FLOAT 13
#define INVALID_INTEGER 14
// INVALID_ROLE and MISSING_ROLE are not referenced by the code visible in
// this file.
#define INVALID_ROLE 15
#define MISSING_ROLE 16
// Kinds of attribute. The enumerators mirror the type keywords that
// parseType recognises ("categorical", "integer", "doubleReal",
// "floatReal"); note the last enumerator is spelled with a capital F.
enum AttributeType {
    categorical = 0,
    integer = 1,
    doubleReal = 2,
    FloatReal = 3
};
// One categorical attribute from the metadata header: its name, its
// free-text description and the single-character category labels.
struct tagCategoricalAttribute {
    std::string name{};
    std::string description{};
    std::vector<char> category;
};
using CategoricalAttribute = tagCategoricalAttribute;
using PCategoricalAttribute = tagCategoricalAttribute*;
// One integer-valued attribute from the metadata header together with its
// declared [loValue, hiValue] range; both bounds default to -1.
struct tagIntegerAttribute {
    std::string name{};
    std::string description{};
    int loValue{ -1 };
    int hiValue{ -1 };
};
using IntegerAttribute = tagIntegerAttribute;
using PIntegerAttribute = tagIntegerAttribute*;
// One double-precision attribute from the metadata header together with its
// declared [loValue, hiValue] range; both bounds default to -1.0.
struct tagDoubleAttribute {
    std::string name{};
    std::string description{};
    double loValue{ -1.0 };
    double hiValue{ -1.0 };
};
using DoubleAttribute = tagDoubleAttribute;
using PDoubleAttribute = tagDoubleAttribute*;
// One single-precision attribute from the metadata header together with its
// declared [loValue, hiValue] range; both bounds default to -1.0f.
struct tagFloatAttribute {
    std::string name{};
    std::string description{};
    float loValue{ -1.0f };
    float hiValue{ -1.0f };
};
using FloatAttribute = tagFloatAttribute;
using PFloatAttribute = tagFloatAttribute*;
// Parses a "#name: feature <name>" or "#name: target <name>" metadata line.
//
// A valid name starts with an ASCII letter and continues with ASCII letters,
// digits or spaces up to the end of the line.
//
// Parameters:
//   line      - NUL-terminated metadata line.
//   length    - value stored into `index` on success (callers pass
//               strlen(line)).
//   errorCode - receives 0 on success, MISSING_NAME when neither prefix is
//               present, INVALID_NAME when the name is empty or contains an
//               unsupported character.
//   index     - receives `length` on success; on INVALID_NAME it holds the
//               offset of the offending character.
//   feature   - set to true for a feature attribute, false for a target.
//   name      - receives the characters accepted so far; left untouched when
//               the first character is already invalid.
// Returns true when a complete, valid name was extracted.
static bool parseName(
    char line[],
    int length,
    int& errorCode,
    int& index,
    bool& feature,
    std::string& name)
{
    static const char featureTag[] = "#name: feature ";
    static const char targetTag[] = "#name: target ";
    const char* featureAt = std::strstr(line, featureTag);
    const char* targetAt = std::strstr(line, targetTag);
    if (featureAt == nullptr && targetAt == nullptr) {
        errorCode = MISSING_NAME;
        return false;
    }
    // The feature tag takes precedence when both tags occur in the line.
    feature = (featureAt != nullptr);
    // Start right after the matched tag, wherever it sits in the line.
    const char* cursor = feature
        ? featureAt + std::strlen(featureTag)
        : targetAt + std::strlen(targetTag);

    const auto isLetter = [](char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    };
    const auto isDigit = [](char c) {
        return c >= '0' && c <= '9';
    };

    index = static_cast<int>(cursor - line);
    // Covers both an empty name (cursor at the terminator) and a name that
    // does not begin with a letter.
    if (!isLetter(*cursor)) {
        errorCode = INVALID_NAME;
        return false;
    }
    name.clear();
    // Every iteration consumes exactly one character, so an unsupported
    // character ends the scan with an error instead of stalling it.
    for (; *cursor != '\0'; ++cursor) {
        const char c = *cursor;
        if (!(isLetter(c) || isDigit(c) || c == ' ')) {
            index = static_cast<int>(cursor - line);
            errorCode = INVALID_NAME;
            return false;
        }
        name += c;
    }
    errorCode = 0;
    index = length;
    return true;
}
// Parses a "#description: <text>" metadata line.
//
// Parameters:
//   line        - NUL-terminated metadata line.
//   length      - number of characters of `line` to consider (callers pass
//                 strlen(line)).
//   errorCode   - receives 0 on success, MISSING_DESCRIPTION when the tag is
//                 absent, INVALID_DESCRIPTION when no text follows the tag.
//   index       - receives `length` on success; untouched on failure.
//   description - the text after the tag is APPENDED to this string, so
//                 callers clear it first when they want only this line.
// Returns true when a non-empty description was found.
static bool parseDescription(
    char line[],
    int length,
    int& errorCode,
    int& index,
    std::string& description) {
    static const char tag[] = "#description: ";
    const char* tagAt = std::strstr(line, tag);
    if (tagAt == nullptr) {
        errorCode = MISSING_DESCRIPTION;
        return false;
    }
    // Offset of the first description character, measured from where the
    // tag was actually found rather than from the start of the line.
    const int start = static_cast<int>(tagAt - line) +
        static_cast<int>(std::strlen(tag));
    if (start >= length) {
        errorCode = INVALID_DESCRIPTION;
        return false;
    }
    description.append(line + start,
        static_cast<std::size_t>(length - start));
    index = length;
    errorCode = 0;
    return true;
}
// Parses the brace-delimited category list of a categorical type line,
// e.g. "#type: categorical {M,F,I}".
//
// Every character between '{' and the first '}' other than ',' is stored as
// one category. Whitespace is not skipped.
// NOTE(review): confirm whether blanks after the commas should be ignored.
//
// Parameters:
//   line      - NUL-terminated metadata line.
//   length    - number of characters of `line` to consider.
//   errorCode - receives 0 on success, INVALID_CATEGORICAL when the braces
//               are missing or the list is empty.
//   index     - receives the offset at which scanning stopped (the closing
//               brace on success); untouched when no '{' is found.
//   category  - cleared, then filled with the categories of this line.
// Returns true when at least one category and the closing brace were found.
static bool parseCategorical(
    char line[],
    int length,
    int& errorCode,
    int& index,
    std::vector<char>& category) {
    // Start from an empty list so the non-empty check below reflects this
    // line only, even when the caller reuses one vector for every attribute.
    category.clear();
    const char* open = std::strchr(line, '{');
    if (open == nullptr) {
        errorCode = INVALID_CATEGORICAL;
        return false;
    }
    const char* const limit = line + length;
    const char* cursor = open + 1;
    // Stop at the closing brace, the terminator or the stated length,
    // whichever comes first, so the scan never leaves the line.
    while (cursor < limit && *cursor != '\0' && *cursor != '}') {
        if (*cursor != ',')
            category.push_back(*cursor);
        ++cursor;
    }
    index = static_cast<int>(cursor - line);
    const bool closed = cursor < limit && *cursor == '}';
    if (closed && !category.empty()) {
        errorCode = 0;
        return true;
    }
    errorCode = INVALID_CATEGORICAL;
    return false;
}
// Parses the "[lo, hi]" range of a "#type: doubleReal [lo, hi]" line.
//
// Each bound must be a complete floating-point number, optionally surrounded
// by blanks. The range must be closed by ']'.
//
// Parameters:
//   line      - NUL-terminated metadata line containing "doubleReal [".
//   length    - unused; kept for signature compatibility.
//   errorCode - receives 0 on success, INVALID_DOUBLE when the comma is
//               missing or a bound is not a number, INVALID_RANGE when the
//               keyword or the closing ']' is missing.
//   index     - receives the offset just past ']' on success; untouched on
//               failure.
//   hiDouble  - receives the upper bound (written only on success).
//   loDouble  - receives the lower bound (written only on success).
// Returns true when both bounds were parsed.
static bool parseDoubleRange(
    char line[],
    int length,
    int& errorCode,
    int& index,
    double& hiDouble,
    double& loDouble)
{
    static const char keyword[] = "doubleReal [";
    const char* keywordAt = std::strstr(line, keyword);
    if (keywordAt == nullptr) {
        errorCode = INVALID_RANGE;
        return false;
    }
    const char* lowBegin = keywordAt + std::strlen(keyword);
    const char* comma = std::strchr(lowBegin, ',');
    if (comma == nullptr) {
        errorCode = INVALID_DOUBLE;
        return false;
    }
    // Require the closing bracket so the upper bound is delimited explicitly
    // and none of its digits can be lost at the end of the line.
    const char* close = std::strchr(comma + 1, ']');
    if (close == nullptr) {
        errorCode = INVALID_RANGE;
        return false;
    }

    // Converts one bound; trailing blanks are tolerated, anything else after
    // the number makes the bound invalid.
    const auto toDouble = [](const std::string& text, double& value) -> bool {
        try {
            std::size_t used = 0;
            const double parsed = std::stod(text, &used);
            while (used < text.size() &&
                (text[used] == ' ' || text[used] == '\t'))
                ++used;
            if (used != text.size())
                return false;
            value = parsed;
            return true;
        }
        catch (const std::exception&) {
            // std::stod throws on empty, non-numeric or out-of-range text.
            return false;
        }
    };

    double low = 0.0;
    double high = 0.0;
    if (!toDouble(std::string(lowBegin, comma), low) ||
        !toDouble(std::string(comma + 1, close), high)) {
        errorCode = INVALID_DOUBLE;
        return false;
    }
    loDouble = low;
    hiDouble = high;
    index = static_cast<int>(close - line) + 1;
    errorCode = 0;
    return true;
}
// Parses the "[lo, hi]" range of a "#type: floatReal [lo, hi]" line.
//
// The range is located by searching for "floatReal [" in the line, so the
// incoming value of `index` is not used as a starting position. Each bound
// must be a complete floating-point number, optionally surrounded by blanks.
// The range must be closed by ']'.
//
// Parameters:
//   line      - NUL-terminated metadata line containing "floatReal [".
//   length    - unused; kept for signature compatibility.
//   errorCode - receives 0 on success, INVALID_FLOAT when the comma is
//               missing or a bound is not a number, INVALID_RANGE when the
//               keyword or the closing ']' is missing.
//   index     - receives the offset just past ']' on success; untouched on
//               failure.
//   hiFloat   - receives the upper bound (written only on success).
//   loFloat   - receives the lower bound (written only on success).
// Returns true when both bounds were parsed.
static bool parseFloatRange(
    char line[],
    int length,
    int& errorCode,
    int& index,
    float& hiFloat,
    float& loFloat)
{
    static const char keyword[] = "floatReal [";
    const char* keywordAt = std::strstr(line, keyword);
    if (keywordAt == nullptr) {
        errorCode = INVALID_RANGE;
        return false;
    }
    const char* lowBegin = keywordAt + std::strlen(keyword);
    const char* comma = std::strchr(lowBegin, ',');
    if (comma == nullptr) {
        errorCode = INVALID_FLOAT;
        return false;
    }
    const char* close = std::strchr(comma + 1, ']');
    if (close == nullptr) {
        errorCode = INVALID_RANGE;
        return false;
    }

    // Converts one bound; trailing blanks are tolerated, anything else after
    // the number makes the bound invalid.
    const auto toFloat = [](const std::string& text, float& value) -> bool {
        try {
            std::size_t used = 0;
            const float parsed = std::stof(text, &used);
            while (used < text.size() &&
                (text[used] == ' ' || text[used] == '\t'))
                ++used;
            if (used != text.size())
                return false;
            value = parsed;
            return true;
        }
        catch (const std::exception&) {
            // std::stof throws on empty, non-numeric or out-of-range text.
            return false;
        }
    };

    float low = 0.0f;
    float high = 0.0f;
    // Each bound is sliced exactly once from the line: [lowBegin, comma) and
    // (comma, close).
    if (!toFloat(std::string(lowBegin, comma), low) ||
        !toFloat(std::string(comma + 1, close), high)) {
        errorCode = INVALID_FLOAT;
        return false;
    }
    loFloat = low;
    hiFloat = high;
    index = static_cast<int>(close - line) + 1;
    errorCode = 0;
    return true;
}
static bool parseIntegerRange(
char line[],
int length,
int& errorCode,
int& index,
int& hiInteger,
int& loInteger) {
char ch = '\0';
int i = 0;
std::string integerStr;
ch = line[i++];
if (ch < '0' || ch > '9') {
errorCode = INVALID_INTEGER;
return false;
}
while (ch != ',' && index < length) {
integerStr.push_back(ch);
ch = line[i++];
index++;
}
if (integerStr.size() == 0) {
errorCode = INVALID_INTEGER;
return false;
}
else {
try {
loInteger = std::stoi(integerStr);
integerStr = "";
i = 0;
ch = line[i++];
ch = line[i++];
ch = line[i++];
index += 3;
while (
ch >= '0' && ch <= '9' &&
ch != ']' && index < length) {
integerStr.push_back(ch);
ch = line[i++];
index++;
}
if (integerStr.size() == 0) {
errorCode = INVALID_INTEGER;
return false;
}
hiInteger = std::stoi(integerStr);
errorCode = 0;
return true;
}
catch (std::exception ex) {
errorCode = INVALID_INTEGER;
return false;
}
}
errorCode = INVALID_RANGE;
return false;
}
// Parses a "#type: ..." metadata line and dispatches to the helper for the
// declared type.
//
// Recognised forms (matched by substring search after the tag):
//   "categorical {"  -> parseCategorical, fills `alphabet`
//   "integer ["      -> parseIntegerRange, fills loInteger / hiInteger
//   "doubleReal ["   -> parseDoubleRange, fills loDouble / hiDouble
//   "floatReal ["    -> parseFloatRange, fills loFloat / hiFloat
//
// Parameters:
//   line      - NUL-terminated metadata line.
//   length    - length of `line` (callers pass strlen(line)).
//   errorCode - receives 0 on success; MISSING_TYPE when the tag is absent;
//               INVALID_TYPE when nothing follows the tag or no known type
//               keyword is present; INVALID_CATEGORICAL when the category
//               list is rejected; otherwise the code set by the range helper
//               that failed.
//   index     - set to the tag length before dispatch, then updated by the
//               helper that runs.
//   type      - receives "categorical", "integer", "doubleReal" or
//               "floatReal" on success.
// Returns true when the type and its range/category list were parsed.
static bool parseType(
    char line[],
    int length,
    double& hiDouble,
    double& loDouble,
    float& hiFloat,
    float& loFloat,
    int& errorCode,
    int& index,
    int& hiInteger,
    int& loInteger,
    std::string& type,
    std::vector<char>& alphabet) {
    static const char typeTag[] = "#type: ";
    static const char integerKeyword[] = "integer [";
    if (std::strstr(line, typeTag) == nullptr) {
        errorCode = MISSING_TYPE;
        return false;
    }
    const int tagLength = static_cast<int>(std::strlen(typeTag));
    if (tagLength >= length) {
        errorCode = INVALID_TYPE;
        return false;
    }
    index = tagLength;
    char* body = line + index;

    if (std::strstr(body, "categorical {") != nullptr) {
        if (!parseCategorical(line, length, errorCode, index, alphabet)) {
            errorCode = INVALID_CATEGORICAL;
            return false;
        }
        errorCode = 0;
        type = "categorical";
        return true;
    }
    if (char* integerAt = std::strstr(body, integerKeyword)) {
        // parseIntegerRange expects the text right after the keyword, so
        // derive that position from the match itself.
        if (!parseIntegerRange(
            integerAt + std::strlen(integerKeyword),
            length,
            errorCode,
            index,
            hiInteger,
            loInteger))
            return false;
        type = "integer";
        return true;
    }
    if (std::strstr(body, "doubleReal [") != nullptr) {
        if (!parseDoubleRange(
            line,
            length,
            errorCode,
            index,
            hiDouble,
            loDouble))
            return false;
        type = "doubleReal";
        return true;
    }
    if (std::strstr(body, "floatReal [") != nullptr) {
        // Keep the helper's specific error code on failure, matching the
        // integer and double branches.
        if (!parseFloatRange(
            line,
            length,
            errorCode,
            index,
            hiFloat,
            loFloat))
            return false;
        type = "floatReal";
        return true;
    }
    errorCode = INVALID_TYPE;
    return false;
}
// Reads the next line of the metadata header into `line`.
//
// Parameters:
//   file1     - stream to read from.
//   line      - destination buffer; up to 256 bytes are written, so it must
//               be at least that large (main passes a char[256]).
//   errorCode - set to EMPTY_FILE when the line is empty and `index` is -1
//               on entry; set to 0 when end-of-file is reached with nothing
//               read; otherwise left unchanged.
//   index     - callers pass -1 to mark the first line of a record; set to
//               FILE_EOF when end-of-file is reached with nothing read.
// Returns true only when a non-empty line that does not contain
// "#endheader" was read.
static bool readMetaDataLine(
    std::ifstream& file1,
    char line[],
    int& errorCode,
    int& index) {
    file1.getline(line, 256);
    const bool blank = (line[0] == '\0');
    if (blank && index == -1) {
        errorCode = EMPTY_FILE;
        return false;
    }
    // A final line without a trailing newline sets eofbit while still
    // delivering text, so end-of-file only ends the input when nothing was
    // read.
    if (blank && file1.eof()) {
        errorCode = 0;
        index = FILE_EOF;
        return false;
    }
    if (blank)
        return false;
    return std::strstr(line, "#endheader") == nullptr;
}
// Running extrema of the dataset columns, filled in by readDatasetFile().
// Slot k of dbl_min / dbl_max belongs to comma-separated field k + 1 of each
// row (fields 1-8); int_min / int_max belong to field 9.
double dbl_max[8] = { 0 };
double dbl_min[8] = { 0 };
int int_max = 0;
int int_min = 0;

// Scans a comma-separated dataset and records the minimum and maximum of
// each of its first nine fields in the globals above.
//
// Fields 1-8 are read as doubles, field 9 as an int unless it contains one
// of the letters F, I or M. Fields that do not start with a number (such as
// a category letter) and empty fields are ignored; fields after the ninth
// are not examined. When a column never yields a number its minimum stays at
// the largest and its maximum at the lowest representable value.
//
// Parameters:
//   file2 - stream to read; it is closed before returning. A stream that
//           failed to open yields no rows.
static void readDatasetFile(
    std::ifstream& file2) {
    for (int i = 0; i < 8; i++) {
        dbl_min[i] = DBL_MAX;
        // DBL_MIN is the smallest positive double, so the most negative
        // finite value, -DBL_MAX, is the correct seed for a maximum.
        dbl_max[i] = -DBL_MAX;
    }
    int_min = INT_MAX;
    int_max = INT_MIN;

    std::string row;
    // Looping on the read itself ends the scan on any stream failure, not
    // only on end-of-file.
    while (std::getline(file2, row)) {
        std::size_t begin = 0;
        for (int column = 1; column <= 9 && begin < row.size(); ++column) {
            std::size_t end = row.find(',', begin);
            if (end == std::string::npos)
                end = row.size();
            const std::string field = row.substr(begin, end - begin);
            begin = end + 1;
            try {
                if (column <= 8) {
                    const double x = std::stod(field);
                    if (x > dbl_max[column - 1])
                        dbl_max[column - 1] = x;
                    if (x < dbl_min[column - 1])
                        dbl_min[column - 1] = x;
                }
                else if (field.find_first_of("FIM") == std::string::npos) {
                    const int x = std::stoi(field);
                    if (x > int_max)
                        int_max = x;
                    if (x < int_min)
                        int_min = x;
                }
            }
            catch (const std::exception&) {
                // Not a number (or out of range): leave the extrema alone.
            }
        }
    }
    file2.close();
}
int main()
{
bool feature = false;
char filename1[256] = "C:\\Users\\James\\OneDrive\\Desktop\\ID3MetadataParser\\x64\\Debug\\ID3MetadataParserDataFile.txt";
char filename2[256] = "C:\\Users\\James\\OneDrive\\Desktop\\ID3MetadataParser\\x64\\Debug\\abalone.data.txt";
char line[256] = "";
int errorCode = -1, index = -1, role = -1;
std::ifstream file1(filename1);
std::ifstream file2(filename2);
// file1 format
std::string name, description, type;
std::vector<char> category;
// file2 format
std::string cat, length, diameter, height, whole;
std::string shucked, viscera, shell, rings;
std::vector<CategoricalAttribute> categoricalAttributes;
std::vector<IntegerAttribute> integerAttributes;
std::vector<DoubleAttribute> doubleAttributes;
std::vector<FloatAttribute> floatAttributes;
std::vector<std::string> names;
std::vector<std::string> descriptions;
std::vector<std::string> types;
while (!file1.eof()) {
index = -1;
bool result = readMetaDataLine(
file1,
line,
errorCode,
index);
if (!result)
break;
index = 0;
int length = static_cast<int>(strlen(line));
if (length == 0)
break;
name = "";
bool pn = parseName(
line,
length,
errorCode,
index,
feature,
name);
if (pn) {
bool result = readMetaDataLine(
file1,
line,
errorCode,
index);
if (!result)
break;
length = static_cast<int>(strlen(line));
index = 0;
description = "";
bool pd = parseDescription(
line,
length,
errorCode,
index,
description);
if (pd) {
bool result = readMetaDataLine(
file1,
line,
errorCode,
index);
if (!result)
break;
length = static_cast<int>(strlen(line));
index = 0;
type = "";
double hiDouble = DBL_MIN;
double loDouble = DBL_MAX;
float hiFloat = FLT_MIN;
float loFloat = FLT_MAX;
int hiInteger = INT_MIN;
int loInteger = INT_MAX;
bool pt = parseType(
line,
length,
hiDouble,
loDouble,
hiFloat,
loFloat,
errorCode,
index,
hiInteger,
loInteger,
type,
category);
length = static_cast<int>(strlen(line));
if (pt) {
if (type == "categorical") {
CategoricalAttribute ca;
ca.category = category;
ca.description = description;
ca.name = name;
categoricalAttributes.push_back(ca);
}
else if (type == "integer") {
IntegerAttribute ia;
ia.loValue = loInteger;
ia.hiValue = hiInteger;
ia.description = description;
ia.name = name;
integerAttributes.push_back(ia);
}
else if (type == "doubleReal") {
DoubleAttribute da;
da.loValue = loDouble;
da.hiValue = hiDouble;
da.description = description;
da.name = name;
doubleAttributes.push_back(da);
}
else if (type == "floatReal") {
FloatAttribute fa;
fa.loValue = loFloat;
fa.hiValue = hiFloat;
fa.description = description;
fa.name = name;
floatAttributes.push_back(fa);
}
else {
errorCode = INVALID_TYPE;
break;
}
}
else {
errorCode = MISSING_TYPE;
break;
}
}
else {
errorCode = INVALID_DESCRIPTION;
break;
}
}
else {
errorCode = INVALID_NAME;
return false;
}
}
readDatasetFile(file2);
for (int i = 1; i <= 7; i++) {
std::cout << i << '\t' << dbl_min[i];
std::cout << '\t' << dbl_max[i];
std::cout << std::endl;
}
std::cout << "8\t" << int_min << '\t' << int_max;
std::cout << std::endl;
std::cout << std::endl;
for (int i = 0; i < static_cast<int>(categoricalAttributes.size()); i++) {
std::cout << categoricalAttributes[i].name << ' ';
std::cout << categoricalAttributes[i].description << ' ';
std::cout << std::endl;
}
for (int i = 0; i < static_cast<int>(doubleAttributes.size()); i++) {
std::cout << doubleAttributes[i].name << ' ';
std::cout << doubleAttributes[i].description << ' ';
std::cout << doubleAttributes[i].loValue << ' ';
std::cout << doubleAttributes[i].hiValue;
std::cout << std::endl;
}
for (int i = 0; i < static_cast<int>(floatAttributes.size()); i++) {
std::cout << floatAttributes[i].name << ' ';
std::cout << floatAttributes[i].description << ' ';
std::cout << floatAttributes[i].loValue << ' ';
std::cout << floatAttributes[i].hiValue;
std::cout << std::endl;
}
for (int i = 0; i < static_cast<int>(integerAttributes.size()); i++) {
std::cout << integerAttributes[i].name << ' ';
std::cout << integerAttributes[i].description << ' ';
std::cout << integerAttributes[i].loValue << ' ';
std::cout << integerAttributes[i].hiValue;
std::cout << std::endl;
}
file1.close();
return 0;
}