2024-03-18 19:19:47 -05:00
|
|
|
#include "arff.hpp"
#include "log.hpp"

#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <sstream>
|
|
|
|
|
|
|
|
namespace ARFF {
|
|
|
|
void ParseArguments(int argc, char* argv[]) {
|
|
|
|
std::string argument_string;
|
|
|
|
for (int i = 0; i < argc; ++i) {
|
|
|
|
argument_string.assign(argv[i]);
|
|
|
|
if (argument_string == "-v" || argument_string == "--verbose") {
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::verbose = true;
|
2024-03-18 19:19:47 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string GetDataFilename(void) {
|
|
|
|
std::string filename;
|
|
|
|
std::cout << "Please enter name of the data file:\t";
|
|
|
|
std::cin >> filename;
|
|
|
|
if (filename.empty()) {
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kError, "No data filename provided, exiting...");
|
2024-03-18 19:19:47 -05:00
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
std::cout << std::endl;
|
|
|
|
return filename;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Store the given text as this attribute's name.
AttributeType::AttributeType(std::string attribute)
    : attribute(attribute) {}
|
|
|
|
|
|
|
|
void AttributeType::AddValue(std::string value) {
|
|
|
|
values.emplace_back(value);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build a data row carrying the given id, with its value list resized to
// `size` entries.
Instance::Instance(const int id, const int size)
    : id(id) {
    values.resize(size);
}
|
|
|
|
|
2024-04-17 21:03:49 -05:00
|
|
|
// Start an evaluation bound to the given attribute; only the pointer is
// stored here.
AttributeEvaluation::AttributeEvaluation(AttributeType *attribute)
    : currentAttribute(attribute) {}
|
|
|
|
|
2024-03-18 19:19:47 -05:00
|
|
|
// Read entire data file and parse it
|
|
|
|
void Arff::Read(std::string filename) {
|
|
|
|
std::ifstream dataFile(filename);
|
|
|
|
if (!dataFile.is_open()) {
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kError, "Unable to open file with name `"
|
2024-03-18 19:19:47 -05:00
|
|
|
+ filename + ", exiting...");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
std::string line;
|
|
|
|
while (std::getline(dataFile, line)) {
|
2024-04-08 17:22:47 -05:00
|
|
|
if ((line.size() == 0) || (line.size() == 1)) { continue; }
|
2024-03-18 19:19:47 -05:00
|
|
|
switch (line.at(0)) {
|
|
|
|
case '%':
|
|
|
|
// Comment line in data
|
|
|
|
continue;
|
|
|
|
break;
|
|
|
|
case '@':
|
|
|
|
AddAttribute(line);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
AddData(line);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
TestIntegrity();
|
|
|
|
}
|
|
|
|
|
2024-04-17 14:43:47 -05:00
|
|
|
// Print generic data information
|
|
|
|
// Number of attributes and size of database
|
|
|
|
void Arff::PrintOverview(void) {
|
2024-03-18 19:52:05 -05:00
|
|
|
std::cout << attributeList.size() << " attributes\n";
|
|
|
|
std::cout << database.size() << " examples\n";
|
|
|
|
std::cout << std::endl;
|
2024-04-17 14:43:47 -05:00
|
|
|
}
|
2024-03-18 19:52:05 -05:00
|
|
|
|
2024-04-17 14:43:47 -05:00
|
|
|
// Print full data information
|
|
|
|
void Arff::PrintData(void) {
|
2024-03-18 19:52:05 -05:00
|
|
|
std::cout << "Attribute (#): values\n";
|
|
|
|
for (AttributeType type : attributeList) {
|
|
|
|
std::cout << type.attribute << " (" << type.values.size() << "):";
|
|
|
|
for (std::string value : type.values) {
|
2024-04-08 17:43:07 -05:00
|
|
|
if (value == "?") { continue; }
|
2024-03-18 19:52:05 -05:00
|
|
|
std::cout << " " << value;
|
|
|
|
}
|
|
|
|
std::cout << '\n';
|
|
|
|
}
|
|
|
|
std::cout << std::endl;
|
|
|
|
|
|
|
|
std::cout << relation << '\n';
|
|
|
|
for (Instance instance: database) {
|
|
|
|
for (std::string value : instance.values) {
|
|
|
|
std::cout << '\t' << value;
|
|
|
|
}
|
|
|
|
std::cout << '\n';
|
|
|
|
}
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
|
2024-04-17 14:43:47 -05:00
|
|
|
// Print result of applying OneR
|
|
|
|
// TODO: Create function
|
2024-04-17 21:03:49 -05:00
|
|
|
void Arff::OneR(void) {
|
|
|
|
AttributeEvaluation bestAttribute = _OneR();
|
|
|
|
debug::Log(kNone, "***Best 1-rule***");
|
|
|
|
debug::Log(kNone, "\t" + bestAttribute.currentAttribute->attribute + ':');
|
|
|
|
for (auto it : bestAttribute.rules) {
|
|
|
|
debug::Log(kNone, "\t\t" + it.first + " ---> " + it.second);
|
|
|
|
}
|
|
|
|
debug::Log(kNone, "Error rate: " + std::to_string(bestAttribute.totalError) + "/" + std::to_string(database.size()));
|
2024-04-17 14:43:47 -05:00
|
|
|
}
|
|
|
|
|
2024-04-17 21:03:49 -05:00
|
|
|
|
2024-03-18 19:19:47 -05:00
|
|
|
// Add the attribute to the list
|
|
|
|
void Arff::AddAttribute(std::string line) {
|
|
|
|
std::stringstream parser(line);
|
|
|
|
std::string token;
|
|
|
|
parser >> token;
|
|
|
|
// Signifies beginning of data
|
|
|
|
// Might add a boolean later to mark this
|
2024-04-08 18:03:46 -05:00
|
|
|
if (token == "@data" || token == "@DATA") {
|
2024-03-18 19:19:47 -05:00
|
|
|
return;
|
|
|
|
}
|
2024-04-08 18:03:46 -05:00
|
|
|
if (token == "@relation" || token == "@RELATION") {
|
2024-03-18 19:19:47 -05:00
|
|
|
parser >> token;
|
|
|
|
relation = token;
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kLog, "Relation set: " + relation);
|
2024-03-18 19:19:47 -05:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
parser >> token;
|
|
|
|
attributeList.emplace_back(token);
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kLog, "Added attribute: " + token);
|
2024-04-08 17:43:07 -05:00
|
|
|
while (std::getline(parser, token, ',')) {
|
2024-03-18 19:19:47 -05:00
|
|
|
// Clean token from outside pieces
|
2024-04-08 17:43:07 -05:00
|
|
|
token.erase(std::remove(token.begin(), token.end(), ' '), token.end());
|
|
|
|
token.erase(std::remove(token.begin(), token.end(), '\t'), token.end());
|
2024-03-18 19:19:47 -05:00
|
|
|
token.erase(std::remove(token.begin(), token.end(), '{'), token.end());
|
|
|
|
token.erase(std::remove(token.begin(), token.end(), '}'), token.end());
|
|
|
|
token.erase(std::remove(token.begin(), token.end(), ','), token.end());
|
|
|
|
token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
|
|
|
|
token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
|
|
|
|
attributeList.back().AddValue(token);
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kLog, "Added value: " + token);
|
2024-03-18 19:19:47 -05:00
|
|
|
}
|
2024-04-08 17:43:07 -05:00
|
|
|
// Additional missing value case
|
|
|
|
attributeList.back().AddValue("?");
|
2024-03-18 19:19:47 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// Add data to runtime database
|
|
|
|
void Arff::AddData(std::string line) {
|
|
|
|
std::istringstream parser(line);
|
|
|
|
std::string token;
|
|
|
|
int id = 0;
|
|
|
|
if (!database.empty()) { id = database.back().id + 1; }
|
|
|
|
database.emplace_back(id, attributeList.size());
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kLog, "Added id: " + std::to_string(database.back().id));
|
2024-03-18 19:19:47 -05:00
|
|
|
for (int i = 0; i < attributeList.size(); ++i) {
|
|
|
|
std::getline(parser, token, ',');
|
2024-04-08 17:43:07 -05:00
|
|
|
token.erase(std::remove(token.begin(), token.end(), ' '), token.end());
|
2024-03-18 19:19:47 -05:00
|
|
|
token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
|
|
|
|
token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
|
|
|
|
database.back().values.at(i) = token;
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kLog, "Added instance value: " + token);
|
2024-03-18 19:19:47 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void Arff::TestIntegrity(void) {
|
|
|
|
for (Instance instance : database) {
|
|
|
|
int successCheck = 0;
|
|
|
|
for (int i = 0; i < attributeList.size(); ++i) {
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kTrace, "Instance value tested: '"
|
2024-03-18 19:19:47 -05:00
|
|
|
+ instance.values.at(i) + "'");
|
|
|
|
for (std::string value : attributeList.at(i).values) {
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kTrace, "attributeList value: '"
|
2024-03-18 19:19:47 -05:00
|
|
|
+ value + "'");
|
|
|
|
if (instance.values.at(i) == value) {
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kTrace, "Value found: " + value);
|
2024-03-18 19:19:47 -05:00
|
|
|
++successCheck;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (successCheck != attributeList.size()) {
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kError, "Value size mismatch: "
|
2024-03-18 19:19:47 -05:00
|
|
|
+ std::to_string(successCheck) + " out of "
|
|
|
|
+ std::to_string(attributeList.size()));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
2024-04-17 15:26:27 -05:00
|
|
|
debug::Log(kLog, "All values exist, continuing...");
|
2024-03-18 19:19:47 -05:00
|
|
|
}
|
2024-04-17 21:03:49 -05:00
|
|
|
|
|
|
|
// Perform OneR on data that was previously read in
|
|
|
|
AttributeEvaluation Arff::_OneR(void) {
|
|
|
|
AttributeEvaluation bestEvaluation;
|
|
|
|
bestEvaluation.totalErrorRate = 1.0f;
|
|
|
|
// -1 used for ignoring test rule (eg, play=yes/no)
|
|
|
|
for (int i = 0; i < attributeList.size() - 1; ++i) {
|
|
|
|
AttributeEvaluation evaluation = EvaluateAttribute(&attributeList[i], i);
|
|
|
|
if (evaluation.totalErrorRate < bestEvaluation.totalErrorRate) {
|
|
|
|
bestEvaluation = evaluation;
|
|
|
|
bestEvaluation.currentAttribute = evaluation.currentAttribute;
|
|
|
|
}
|
|
|
|
debug::Log(kLog, "Evaluation on " + evaluation.currentAttribute->attribute + " completed");
|
|
|
|
}
|
|
|
|
return bestEvaluation;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Determine error rate and best option for each value of an attribute
|
|
|
|
// Originally set up to use OneR
|
|
|
|
AttributeEvaluation Arff::EvaluateAttribute(AttributeType *attribute, const int attributePos) {
|
|
|
|
AttributeEvaluation evaluation(attribute);
|
|
|
|
std::map<std::string, int> results;
|
|
|
|
for (std::string value : attributeList.end()->values) {
|
|
|
|
results.emplace(value, 0);
|
|
|
|
}
|
|
|
|
for (int i = 0; i < attribute->values.size(); ++i) {
|
2024-04-17 21:25:11 -05:00
|
|
|
int resultsTotal = 0;
|
2024-04-17 21:03:49 -05:00
|
|
|
if (attribute->values[i] == "?") { continue; }
|
|
|
|
for (auto instance = database.begin(); instance != database.end(); ++instance) {
|
|
|
|
if (instance->values[attributePos] != attribute->values[i]) { continue; }
|
|
|
|
++results[instance->values.back()];
|
2024-04-17 21:25:11 -05:00
|
|
|
resultsTotal++;
|
2024-04-17 21:03:49 -05:00
|
|
|
}
|
|
|
|
debug::Log(kTrace, "Results:");
|
|
|
|
for (auto it : results) { debug::Log(kTrace, "\t" + it.first + ": " + std::to_string(it.second)); }
|
2024-04-17 21:25:11 -05:00
|
|
|
float lowest;
|
|
|
|
float lowestRate = 1.0f;
|
2024-04-17 21:03:49 -05:00
|
|
|
std::string bestResult = results.begin()->first;
|
|
|
|
for (auto it = results.begin(); it != results.end(); ++it) {
|
2024-04-17 21:25:11 -05:00
|
|
|
if (((resultsTotal - it->second) / float(resultsTotal)) < lowestRate) {
|
|
|
|
lowestRate = ((resultsTotal - it->second) / float(resultsTotal));
|
|
|
|
lowest = resultsTotal - it->second;
|
2024-04-17 21:03:49 -05:00
|
|
|
bestResult = it->first;
|
2024-04-17 21:25:11 -05:00
|
|
|
}
|
2024-04-17 21:03:49 -05:00
|
|
|
}
|
|
|
|
evaluation.rules.emplace(attribute->values[i], bestResult);
|
|
|
|
evaluation.totalError += lowest;
|
|
|
|
debug::Log(kLog, "Added rule " + attribute->values[i] + "->" + bestResult);
|
|
|
|
// Reset
|
|
|
|
for (auto it = results.begin(); it != results.end(); ++it) {
|
|
|
|
it->second = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
evaluation.totalErrorRate = evaluation.totalError / float(database.size());
|
|
|
|
return evaluation;
|
|
|
|
}
|
2024-03-18 19:19:47 -05:00
|
|
|
}
|