#include "arff.hpp"
#include "log.hpp"

// NOTE(review): the names of the four standard headers were missing from the
// reviewed text; this list is reconstructed from what the file actually uses.
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <utility>

namespace ARFF {

namespace {

// Removes every character of `text` that also appears in `unwanted`.
void StripCharacters(std::string& text, const std::string& unwanted) {
    text.erase(std::remove_if(text.begin(), text.end(),
                              [&unwanted](char c) {
                                  return unwanted.find(c) != std::string::npos;
                              }),
               text.end());
}

// Returns a lower-cased copy of `text` (used for keyword comparison only).
std::string ToLower(std::string text) {
    std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) {
        return static_cast<char>(std::tolower(c));
    });
    return text;
}

}  // namespace

// Scans the command line and enables verbose logging when `-v` or
// `--verbose` is present.
//
// argc: number of entries in argv.
// argv: argument strings; every entry in [0, argc) is inspected.
void ParseArguments(int argc, char* argv[]) {
    for (int i = 0; i < argc; ++i) {
        const std::string argument(argv[i]);
        if (argument == "-v" || argument == "--verbose") {
            debug::verbose = true;
        }
    }
}

// Prompts on stdout and reads one whitespace-delimited filename from stdin.
// Logs an error and terminates the process if nothing could be read.
std::string GetDataFilename(void) {
    std::string filename;
    std::cout << "Please enter name of the data file:\t";
    std::cin >> filename;
    if (filename.empty()) {
        debug::Log(kError, "No data filename provided, exiting...");
        std::exit(1);
    }
    std::cout << std::endl;
    return filename;
}

// Creates an attribute with the given name and no values.
AttributeType::AttributeType(std::string attribute) {
    this->attribute = std::move(attribute);
}

// Appends one permitted value to this attribute.
void AttributeType::AddValue(std::string value) {
    values.emplace_back(std::move(value));
}

// Creates an instance with the given id and `size` empty value slots.
Instance::Instance(const int id, const int size) {
    this->id = id;
    this->values.resize(size);
}

// Creates an empty evaluation bound to `attribute`.
AttributeEvaluation::AttributeEvaluation(AttributeType *attribute) {
    this->currentAttribute = attribute;
    // Zero the counters explicitly: totalError is accumulated with += later,
    // and this constructor is the only initialisation visible in this file.
    this->totalError = 0;
    this->totalErrorRate = 0.0f;
}

// Reads the whole data file and parses it line by line, then verifies that
// every stored value is a declared attribute value.
// Terminates the process if the file cannot be opened.
void Arff::Read(std::string filename) {
    std::ifstream dataFile(filename);
    if (!dataFile.is_open()) {
        debug::Log(kError, "Unable to open file with name `" + filename +
                           "`, exiting...");
        std::exit(1);
    }

    std::string line;
    while (std::getline(dataFile, line)) {
        // Skip empty lines and one-character lines (such as a stray '\r').
        if (line.size() <= 1) {
            continue;
        }
        switch (line.front()) {
            case '%':  // Comment line in data
                break;
            case '@':
                AddAttribute(line);
                break;
            default:
                AddData(line);
                break;
        }
    }

    TestIntegrity();
}

// Prints the number of attributes and the number of examples.
void Arff::PrintOverview(void) {
    std::cout << attributeList.size() << " attributes\n";
    std::cout << database.size() << " examples\n";
    std::cout << std::endl;
}

// Prints every attribute with its values (the "?" placeholder is hidden),
// followed by the relation name and all stored instances.
void Arff::PrintData(void) {
    std::cout << "Attribute (#): values\n";
    for (const auto& type : attributeList) {
        std::cout << type.attribute << " (" << type.values.size() << "):";
        for (const auto& value : type.values) {
            if (value == "?") {
                continue;
            }
            std::cout << " " << value;
        }
        std::cout << '\n';
    }
    std::cout << std::endl;

    std::cout << relation << '\n';
    for (const auto& instance : database) {
        for (const auto& value : instance.values) {
            std::cout << '\t' << value;
        }
        std::cout << '\n';
    }
    std::cout << std::endl;
}

// Runs OneR and logs the best single-attribute rule set with its error count.
void Arff::OneR(void) {
    const AttributeEvaluation bestAttribute = _OneR();
    if (bestAttribute.currentAttribute == nullptr) {
        // Nothing to report, and dereferencing the pointer would crash.
        debug::Log(kError, "No attribute available for OneR evaluation");
        return;
    }

    debug::Log(kNone, "***Best 1-rule***");
    debug::Log(kNone, "\t" + bestAttribute.currentAttribute->attribute + ':');
    for (const auto& rule : bestAttribute.rules) {
        debug::Log(kNone, "\t\t" + rule.first + " ---> " + rule.second);
    }
    debug::Log(kNone, "Error rate: " + std::to_string(bestAttribute.totalError) +
                      "/" + std::to_string(database.size()));
}

// Parses one '@' header line.
//   @data      -> ignored (marks the beginning of the data section)
//   @relation  -> stores the relation name
//   otherwise  -> treated as an attribute declaration: the second token is
//                 the attribute name, the remainder a comma-separated value
//                 list. A "?" value is always appended for missing data.
// Keywords are compared case-insensitively.
void Arff::AddAttribute(std::string line) {
    std::stringstream parser(line);
    std::string token;
    parser >> token;

    const std::string keyword = ToLower(token);
    if (keyword == "@data") {
        return;
    }
    if (keyword == "@relation") {
        parser >> token;
        relation = token;
        debug::Log(kLog, "Relation set: " + relation);
        return;
    }

    parser >> token;
    attributeList.emplace_back(token);
    debug::Log(kLog, "Added attribute: " + token);

    while (std::getline(parser, token, ',')) {
        // Clean token from whitespace, braces and line endings
        StripCharacters(token, " \t{},\r\n");
        attributeList.back().AddValue(token);
        debug::Log(kLog, "Added value: " + token);
    }

    // Additional missing value case
    attributeList.back().AddValue("?");
}

// Adds one comma-separated data row to the runtime database. The row gets the
// next sequential id and one value slot per attribute declared so far.
// A row with fewer fields than attributes leaves the remaining slots empty, so
// TestIntegrity reports it instead of the previous field being repeated.
void Arff::AddData(std::string line) {
    std::istringstream parser(line);

    const int id = database.empty() ? 0 : database.back().id + 1;
    database.emplace_back(id, static_cast<int>(attributeList.size()));
    debug::Log(kLog, "Added id: " + std::to_string(database.back().id));

    for (std::size_t i = 0; i < attributeList.size(); ++i) {
        std::string token;
        if (std::getline(parser, token, ',')) {
            StripCharacters(token, " \r\n");
        } else {
            // A failed extraction does not touch the string; be explicit.
            token.clear();
        }
        database.back().values.at(i) = token;
        debug::Log(kLog, "Added instance value: " + token);
    }
}

// Verifies that every value of every instance is one of the values declared
// for the corresponding attribute. Logs an error and terminates the process on
// the first instance that fails.
void Arff::TestIntegrity(void) {
    for (const auto& instance : database) {
        std::size_t successCheck = 0;
        for (std::size_t i = 0; i < attributeList.size(); ++i) {
            // A row stored before all attributes were declared is too short;
            // stop here so it is reported as a mismatch below.
            if (i >= instance.values.size()) {
                break;
            }
            const auto& candidate = instance.values.at(i);
            debug::Log(kTrace, "Instance value tested: '" + candidate + "'");
            for (const auto& value : attributeList.at(i).values) {
                debug::Log(kTrace, "attributeList value: '" + value + "'");
                if (candidate == value) {
                    debug::Log(kTrace, "Value found: " + value);
                    ++successCheck;
                    break;
                }
            }
        }

        if (successCheck != attributeList.size()) {
            debug::Log(kError, "Value size mismatch: " +
                               std::to_string(successCheck) + " out of " +
                               std::to_string(attributeList.size()));
            std::exit(1);
        }
    }
    debug::Log(kLog, "All values exist, continuing...");
}

// Performs OneR on the data that was previously read in and returns the
// evaluation with the lowest error rate. The last attribute is the class
// attribute (eg, play=yes/no) and is never evaluated as a predictor.
// Returns an evaluation whose currentAttribute is nullptr when there is no
// predictor attribute.
// NOTE(review): identifiers starting with an underscore followed by a capital
// letter are reserved in C++; the name is kept because it is declared outside
// this file.
AttributeEvaluation Arff::_OneR(void) {
    AttributeEvaluation bestEvaluation;
    bestEvaluation.currentAttribute = nullptr;
    bestEvaluation.totalError = 0;
    bestEvaluation.totalErrorRate = 1.0f;

    // size() is unsigned: "size() - 1" would wrap around for an empty list.
    if (attributeList.size() < 2) {
        return bestEvaluation;
    }

    const std::size_t classIndex = attributeList.size() - 1;
    for (std::size_t i = 0; i < classIndex; ++i) {
        const AttributeEvaluation evaluation =
            EvaluateAttribute(&attributeList[i], static_cast<int>(i));
        // Always accept the first candidate so a result exists even when
        // every attribute has an error rate of 1.0.
        if (bestEvaluation.currentAttribute == nullptr ||
            evaluation.totalErrorRate < bestEvaluation.totalErrorRate) {
            bestEvaluation = evaluation;
        }
        debug::Log(kLog, "Evaluation on " +
                         evaluation.currentAttribute->attribute + " completed");
    }
    return bestEvaluation;
}

// Determines, for each value of `attribute`, the most frequent class label
// among the instances carrying that value, and counts the instances that
// disagree with it.
//
// attribute:    attribute to evaluate; its "?" value is skipped.
// attributePos: index of that attribute inside each instance's value list.
// Returns the rules (value -> predicted label), the summed error count and the
// error rate relative to the database size (0 for an empty database).
AttributeEvaluation Arff::EvaluateAttribute(AttributeType *attribute,
                                            const int attributePos) {
    AttributeEvaluation evaluation(attribute);
    evaluation.totalError = 0;

    // Seed the tally with every declared class label. The class attribute is
    // the last element, i.e. back(); end() must never be dereferenced.
    std::map<std::string, int> results;
    if (!attributeList.empty()) {
        for (const auto& label : attributeList.back().values) {
            results.emplace(label, 0);
        }
    }

    for (const auto& value : attribute->values) {
        if (value == "?") {
            continue;
        }

        // Start every attribute value from a clean tally.
        for (auto& entry : results) {
            entry.second = 0;
        }

        int matching = 0;
        for (const auto& instance : database) {
            if (attributePos < 0 ||
                static_cast<std::size_t>(attributePos) >= instance.values.size()) {
                continue;
            }
            if (instance.values[attributePos] != value) {
                continue;
            }
            ++results[instance.values.back()];
            ++matching;
        }

        debug::Log(kTrace, "Results:");
        for (const auto& entry : results) {
            debug::Log(kTrace, "\t" + entry.first + ": " +
                               std::to_string(entry.second));
        }

        // Majority label wins. ">=" lets the later label win a tie, which is
        // what the previous selection did for two-class data.
        std::string bestResult;
        int highest = 0;
        for (const auto& entry : results) {
            if (entry.second >= highest) {
                highest = entry.second;
                bestResult = entry.first;
            }
        }

        // Every matching instance outside the majority label is an error.
        const int errors = matching - highest;

        evaluation.rules.emplace(value, bestResult);
        evaluation.totalError += errors;
        debug::Log(kLog, "Added rule " + value + "->" + bestResult);
    }

    evaluation.totalErrorRate =
        database.empty()
            ? 0.0f
            : evaluation.totalError / static_cast<float>(database.size());
    return evaluation;
}

}  // namespace ARFF