From 9027c0c4ff2837df5d56015a405d6794881f251d Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 13:53:21 -0500 Subject: [PATCH 1/8] oneR: update README for oneR --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 2190089..af083f7 100644 --- a/README.md +++ b/README.md @@ -18,3 +18,8 @@ ARFF: ```plain build/bin/arff ``` + +One Rule: +```plain +build/bin/onerule +``` From dcd6038c219614711a135f1048367569ca5e8c13 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:33:00 -0500 Subject: [PATCH 2/8] gitignore: don't track clang cache files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index eb18772..6e07d9e 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,6 @@ # Build directory build + +# Clang files +.cache From 42daff8a4c5b2f2643f25c6e1fe82365c35d8046 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:33:42 -0500 Subject: [PATCH 3/8] oneR: initial folder and CMake creation --- src/CMakeLists.txt | 1 + src/arff/CMakeLists.txt | 4 ++-- src/onerule/CMakeLists.txt | 6 ++++++ src/onerule/main.cpp | 13 +++++++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 src/onerule/CMakeLists.txt create mode 100644 src/onerule/main.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9e6d78f..04a3160 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(arff) +add_subdirectory(onerule) diff --git a/src/arff/CMakeLists.txt b/src/arff/CMakeLists.txt index 524fe0c..4d7fd37 100644 --- a/src/arff/CMakeLists.txt +++ b/src/arff/CMakeLists.txt @@ -1,5 +1,5 @@ add_executable(arff - ./main.cpp - ./arff.cpp + main.cpp + arff.cpp ) diff --git a/src/onerule/CMakeLists.txt b/src/onerule/CMakeLists.txt new file mode 100644 index 0000000..43ed139 --- /dev/null +++ b/src/onerule/CMakeLists.txt @@ -0,0 +1,6 @@ +include_directories(../arff) +add_executable(onerule + main.cpp + ../arff/arff.cpp +) + diff --git a/src/onerule/main.cpp b/src/onerule/main.cpp new file mode 100644 index 0000000..4f24fae --- /dev/null +++ b/src/onerule/main.cpp @@ -0,0 +1,13 @@ +/* + * Author: Gregory Crawford + * Date: 2024-03-18 + * Description: Read and store ARFF data from a file + */ +#include "arff.hpp" + +int main(int argc, char* argv[]) { + ARFF::ParseArguments(argc, argv); + ARFF::Arff data; + data.Read(ARFF::GetDataFilename()); + data.Print(); +} From c49d160bd59dbc609b7f4c677233ecf133800d51 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:43:47 -0500 Subject: [PATCH 4/8] Split print function into three pieces --- src/arff/arff.cpp | 12 +++++++++++- src/arff/arff.hpp | 4 +++- src/arff/main.cpp | 3 ++- src/onerule/main.cpp | 2 +- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/arff/arff.cpp b/src/arff/arff.cpp index be00b51..bf1a6e6 100644 --- a/src/arff/arff.cpp +++ b/src/arff/arff.cpp @@ -81,11 +81,16 @@ namespace ARFF { TestIntegrity(); } - void Arff::Print(void) { + // Print generic data information + // Number of attributes and size of database + void Arff::PrintOverview(void) { std::cout << attributeList.size() << " attributes\n"; std::cout << database.size() << " examples\n"; std::cout << std::endl; + } + // Print full data information + void Arff::PrintData(void) { std::cout << "Attribute (#): values\n"; for (AttributeType type : attributeList) { std::cout << type.attribute << " (" << type.values.size() << "):"; @@ -107,6 +112,11 @@ namespace ARFF { std::cout << std::endl; } + // Print result of applying OneR + // TODO: Create function + void PrintOneR(void) { + } + // Add the attribute to the list void Arff::AddAttribute(std::string line) { std::stringstream parser(line); diff --git a/src/arff/arff.hpp b/src/arff/arff.hpp index 8afc7af..a38f688 100644 --- a/src/arff/arff.hpp +++ b/src/arff/arff.hpp @@ -29,7 +29,9 @@ namespace ARFF { public: Arff() = default; void Read(std::string filename); - void Print(void); + void PrintOverview(void); + void PrintData(void); + void PrintOneR(void); private: std::string relation; std::vector attributeList; diff --git a/src/arff/main.cpp b/src/arff/main.cpp index 4f24fae..5070438 100644 --- a/src/arff/main.cpp +++ b/src/arff/main.cpp @@ -9,5 +9,6 @@ int main(int argc, char* argv[]) { ARFF::ParseArguments(argc, argv); ARFF::Arff data; data.Read(ARFF::GetDataFilename()); - data.Print(); + data.PrintOverview(); + data.PrintData(); } diff --git a/src/onerule/main.cpp b/src/onerule/main.cpp index 4f24fae..ee0099d 100644 --- a/src/onerule/main.cpp +++ b/src/onerule/main.cpp @@ -9,5 +9,5 @@ int main(int argc, char* argv[]) { ARFF::ParseArguments(argc, argv); ARFF::Arff data; data.Read(ARFF::GetDataFilename()); - data.Print(); + data.PrintOverview(); } From 90539f7e38ba69b992563b851c875361006b86c1 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 15:26:27 -0500 Subject: [PATCH 5/8] log: split log function into another file and added levels --- src/arff/CMakeLists.txt | 1 + src/arff/arff.cpp | 41 +++++++++++++------------------------- src/arff/arff.hpp | 2 -- src/arff/log.cpp | 19 ++++++++++++++++++ src/arff/log.hpp | 19 ++++++++++++++++++ src/onerule/CMakeLists.txt | 1 + src/onerule/main.cpp | 2 ++ 7 files changed, 56 insertions(+), 29 deletions(-) create mode 100644 src/arff/log.cpp create mode 100644 src/arff/log.hpp diff --git a/src/arff/CMakeLists.txt b/src/arff/CMakeLists.txt index 4d7fd37..0fcdd9c 100644 --- a/src/arff/CMakeLists.txt +++ b/src/arff/CMakeLists.txt @@ -1,5 +1,6 @@ add_executable(arff main.cpp arff.cpp + log.cpp ) diff --git a/src/arff/arff.cpp b/src/arff/arff.cpp index bf1a6e6..24c030e 100644 --- a/src/arff/arff.cpp +++ b/src/arff/arff.cpp @@ -1,18 +1,17 @@ #include "arff.hpp" +#include "log.hpp" #include #include #include #include namespace ARFF { - bool isVerbose = false; - void ParseArguments(int argc, char* argv[]) { std::string argument_string; for (int i = 0; i < argc; ++i) { argument_string.assign(argv[i]); if (argument_string == "-v" || argument_string == "--verbose") { - isVerbose = true; + debug::verbose = true; } } } @@ -22,25 +21,13 @@ namespace ARFF { std::cout << "Please enter name of the data file:\t"; std::cin >> filename; if (filename.empty()) { - LogError("ARFF/Setup", "No data filename provided, exiting..."); + debug::Log(kError, "No data filename provided, exiting..."); exit(1); } std::cout << std::endl; return filename; } - void LogInfo(const std::string location, const std::string message) { - if (!isVerbose) { return; } - std::cout << '[' << location << " - INFO] "; - std::cout << message << std::endl; - } - - void LogError(const std::string location, const std::string message) { - if (!isVerbose) { return; } - std::cerr << '[' << location << " - ERROR] "; - std::cerr << message << std::endl; - } - AttributeType::AttributeType(std::string attribute) { this->attribute = attribute; } @@ -58,7 +45,7 @@ namespace ARFF { void Arff::Read(std::string filename) { std::ifstream dataFile(filename); if (!dataFile.is_open()) { - LogError("ARFF/Read", "Unable to open file with name `" + debug::Log(kError, "Unable to open file with name `" + filename + ", exiting..."); exit(1); } @@ -130,12 +117,12 @@ namespace ARFF { if (token == "@relation" || token == "@RELATION") { parser >> token; relation = token; - LogInfo("ARFF/Attribute", "Relation set: " + relation); + debug::Log(kLog, "Relation set: " + relation); return; } parser >> token; attributeList.emplace_back(token); - LogInfo("ARFF/Attribute", "Added attribute: " + token); + debug::Log(kLog, "Added attribute: " + token); while (std::getline(parser, token, ',')) { // Clean token from outside pieces token.erase(std::remove(token.begin(), token.end(), ' '), token.end()); @@ -146,7 +133,7 @@ namespace ARFF { token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); attributeList.back().AddValue(token); - LogInfo("ARFF/Attribute", "Added value: " + token); + debug::Log(kLog, "Added value: " + token); } // Additional missing value case attributeList.back().AddValue("?"); @@ -159,14 +146,14 @@ namespace ARFF { int id = 0; if (!database.empty()) { id = database.back().id + 1; } database.emplace_back(id, attributeList.size()); - LogInfo("ARFF/Data", "Added id: " + std::to_string(database.back().id)); + debug::Log(kLog, "Added id: " + std::to_string(database.back().id)); for (int i = 0; i < attributeList.size(); ++i) { std::getline(parser, token, ','); token.erase(std::remove(token.begin(), token.end(), ' '), token.end()); token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); database.back().values.at(i) = token; - LogInfo("ARFF/Data", "Added instance value: " + token); + debug::Log(kLog, "Added instance value: " + token); } } @@ -174,25 +161,25 @@ namespace ARFF { for (Instance instance : database) { int successCheck = 0; for (int i = 0; i < attributeList.size(); ++i) { - LogInfo("ARFF/Integrity", "Instance value tested: '" + debug::Log(kTrace, "Instance value tested: '" + instance.values.at(i) + "'"); for (std::string value : attributeList.at(i).values) { - LogInfo("ARFF/Integrity", "attributeList value: '" + debug::Log(kTrace, "attributeList value: '" + value + "'"); if (instance.values.at(i) == value) { - LogInfo("ARFF/Integrity", "Value found: " + value); + debug::Log(kTrace, "Value found: " + value); ++successCheck; break; } } } if (successCheck != attributeList.size()) { - LogError("ARFF/Integrity", "Value size mismatch: " + debug::Log(kError, "Value size mismatch: " + std::to_string(successCheck) + " out of " + std::to_string(attributeList.size())); exit(1); } } - LogInfo("ARFF/Integrity", "All values exist, continuing..."); + debug::Log(kLog, "All values exist, continuing..."); } } diff --git a/src/arff/arff.hpp b/src/arff/arff.hpp index a38f688..50a8b42 100644 --- a/src/arff/arff.hpp +++ b/src/arff/arff.hpp @@ -7,8 +7,6 @@ namespace ARFF { void ParseArguments(int argc, char* argv[]); std::string GetDataFilename(void); - void LogInfo(const std::string location, const std::string message); - void LogError(const std::string location, const std::string message); struct AttributeType { public: diff --git a/src/arff/log.cpp b/src/arff/log.cpp new file mode 100644 index 0000000..4ccdeb7 --- /dev/null +++ b/src/arff/log.cpp @@ -0,0 +1,19 @@ +#include "log.hpp" +#include + +namespace debug { + bool verbose = false; + + void Log(LogLevel level, std::string message) { + std::string logMessage = ""; + if (!verbose && level > kNone) { return; } + switch (level) { + case kLog: logMessage += "[LOG] "; break; + case kWarn: logMessage += "[WARN] "; break; + case kError: logMessage += "[ERROR] "; break; + case kTrace: logMessage += "[TRACE] "; break; + } + logMessage += message; + std::cout << logMessage << std::endl; + } +} diff --git a/src/arff/log.hpp b/src/arff/log.hpp new file mode 100644 index 0000000..386ebae --- /dev/null +++ b/src/arff/log.hpp @@ -0,0 +1,19 @@ +#ifndef LOG_HPP +#define LOG_HPP + +#include + +enum LogLevel { + kNone = -1, + kLog = 0, + kWarn, + kError, + kTrace +}; + +namespace debug { + extern bool verbose; + void Log(LogLevel level, std::string message); +} + +#endif diff --git a/src/onerule/CMakeLists.txt b/src/onerule/CMakeLists.txt index 43ed139..90ced46 100644 --- a/src/onerule/CMakeLists.txt +++ b/src/onerule/CMakeLists.txt @@ -2,5 +2,6 @@ include_directories(../arff) add_executable(onerule main.cpp ../arff/arff.cpp + ../arff/log.cpp ) diff --git a/src/onerule/main.cpp b/src/onerule/main.cpp index ee0099d..1dce4fc 100644 --- a/src/onerule/main.cpp +++ b/src/onerule/main.cpp @@ -4,10 +4,12 @@ * Description: Read and store ARFF data from a file */ #include "arff.hpp" +#include "log.hpp" int main(int argc, char* argv[]) { ARFF::ParseArguments(argc, argv); ARFF::Arff data; data.Read(ARFF::GetDataFilename()); data.PrintOverview(); + debug::Log(kLog, "Test"); } From 182eed641b8fd1d9223c2281d6cdd58151e2dac6 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 21:03:49 -0500 Subject: [PATCH 6/8] oneR: got it working with weather, others are wrong, need to find problem --- src/arff/arff.cpp | 67 +++++++++++++++++++++++++++++++++++++++++++- src/arff/arff.hpp | 14 ++++++++- src/onerule/main.cpp | 3 +- 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/arff/arff.cpp b/src/arff/arff.cpp index 24c030e..9eb6025 100644 --- a/src/arff/arff.cpp +++ b/src/arff/arff.cpp @@ -41,6 +41,10 @@ namespace ARFF { this->values.resize(size); } + AttributeEvaluation::AttributeEvaluation(AttributeType *attribute) { + this->currentAttribute = attribute; + } + // Read entire data file and parse it void Arff::Read(std::string filename) { std::ifstream dataFile(filename); @@ -101,9 +105,17 @@ namespace ARFF { // Print result of applying OneR // TODO: Create function - void PrintOneR(void) { + void Arff::OneR(void) { + AttributeEvaluation bestAttribute = _OneR(); + debug::Log(kNone, "***Best 1-rule***"); + debug::Log(kNone, "\t" + bestAttribute.currentAttribute->attribute + ':'); + for (auto it : bestAttribute.rules) { + debug::Log(kNone, "\t\t" + it.first + " ---> " + it.second); + } + debug::Log(kNone, "Error rate: " + std::to_string(bestAttribute.totalError) + "/" + std::to_string(database.size())); } + // Add the attribute to the list void Arff::AddAttribute(std::string line) { std::stringstream parser(line); @@ -182,4 +194,57 @@ namespace ARFF { } debug::Log(kLog, "All values exist, continuing..."); } + + // Perform OneR on data that was previously read in + AttributeEvaluation Arff::_OneR(void) { + AttributeEvaluation bestEvaluation; + bestEvaluation.totalErrorRate = 1.0f; + // -1 used for ignoring test rule (eg, play=yes/no) + for (int i = 0; i < attributeList.size() - 1; ++i) { + AttributeEvaluation evaluation = EvaluateAttribute(&attributeList[i], i); + if (evaluation.totalErrorRate < bestEvaluation.totalErrorRate) { + bestEvaluation = evaluation; + bestEvaluation.currentAttribute = evaluation.currentAttribute; + } + debug::Log(kLog, "Evaluation on " + evaluation.currentAttribute->attribute + " completed"); + } + return bestEvaluation; + } + + // Determine error rate and best option for each value of an attribute + // Originally set up to use OneR + AttributeEvaluation Arff::EvaluateAttribute(AttributeType *attribute, const int attributePos) { + AttributeEvaluation evaluation(attribute); + std::map results; + for (std::string value : attributeList.end()->values) { + results.emplace(value, 0); + } + for (int i = 0; i < attribute->values.size(); ++i) { + if (attribute->values[i] == "?") { continue; } + for (auto instance = database.begin(); instance != database.end(); ++instance) { + if (instance->values[attributePos] != attribute->values[i]) { continue; } + ++results[instance->values.back()]; + } + debug::Log(kTrace, "Results:"); + for (auto it : results) { debug::Log(kTrace, "\t" + it.first + ": " + std::to_string(it.second)); } + int lowest = 9999; + std::string bestResult = results.begin()->first; + for (auto it = results.begin(); it != results.end(); ++it) { + if (it->second < lowest) { + lowest = it->second; + } else { + bestResult = it->first; + } + } + evaluation.rules.emplace(attribute->values[i], bestResult); + evaluation.totalError += lowest; + debug::Log(kLog, "Added rule " + attribute->values[i] + "->" + bestResult); + // Reset + for (auto it = results.begin(); it != results.end(); ++it) { + it->second = 0; + } + } + evaluation.totalErrorRate = evaluation.totalError / float(database.size()); + return evaluation; + } } diff --git a/src/arff/arff.hpp b/src/arff/arff.hpp index 50a8b42..3f76174 100644 --- a/src/arff/arff.hpp +++ b/src/arff/arff.hpp @@ -3,6 +3,7 @@ #include #include +#include namespace ARFF { void ParseArguments(int argc, char* argv[]); @@ -23,13 +24,22 @@ namespace ARFF { std::vector values; }; + struct AttributeEvaluation { + AttributeEvaluation() = default; + AttributeEvaluation(AttributeType *attribute); + AttributeType *currentAttribute; + std::map rules; + float totalErrorRate = 0.0f; + int totalError = 0; + }; + class Arff { public: Arff() = default; void Read(std::string filename); void PrintOverview(void); void PrintData(void); - void PrintOneR(void); + void OneR(void); private: std::string relation; std::vector attributeList; @@ -37,6 +47,8 @@ namespace ARFF { void AddAttribute(std::string line); void AddData(std::string line); void TestIntegrity(void); + AttributeEvaluation _OneR(void); + AttributeEvaluation EvaluateAttribute(AttributeType *attribute, const int attributePos); }; } diff --git a/src/onerule/main.cpp b/src/onerule/main.cpp index 1dce4fc..d6b66b3 100644 --- a/src/onerule/main.cpp +++ b/src/onerule/main.cpp @@ -4,12 +4,11 @@ * Description: Read and store ARFF data from a file */ #include "arff.hpp" -#include "log.hpp" int main(int argc, char* argv[]) { ARFF::ParseArguments(argc, argv); ARFF::Arff data; data.Read(ARFF::GetDataFilename()); data.PrintOverview(); - debug::Log(kLog, "Test"); + data.OneR(); } From 9a164e937032f01612bf0394b748542aa8c134d0 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 21:25:11 -0500 Subject: [PATCH 7/8] oneR: fixed using wrong method of calculating lowest error rate --- src/arff/arff.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/arff/arff.cpp b/src/arff/arff.cpp index 9eb6025..f625495 100644 --- a/src/arff/arff.cpp +++ b/src/arff/arff.cpp @@ -220,21 +220,24 @@ namespace ARFF { results.emplace(value, 0); } for (int i = 0; i < attribute->values.size(); ++i) { + int resultsTotal = 0; if (attribute->values[i] == "?") { continue; } for (auto instance = database.begin(); instance != database.end(); ++instance) { if (instance->values[attributePos] != attribute->values[i]) { continue; } ++results[instance->values.back()]; + resultsTotal++; } debug::Log(kTrace, "Results:"); for (auto it : results) { debug::Log(kTrace, "\t" + it.first + ": " + std::to_string(it.second)); } - int lowest = 9999; + float lowest; + float lowestRate = 1.0f; std::string bestResult = results.begin()->first; for (auto it = results.begin(); it != results.end(); ++it) { - if (it->second < lowest) { - lowest = it->second; - } else { + if (((resultsTotal - it->second) / float(resultsTotal)) < lowestRate) { + lowestRate = ((resultsTotal - it->second) / float(resultsTotal)); + lowest = resultsTotal - it->second; bestResult = it->first; - } + } } evaluation.rules.emplace(attribute->values[i], bestResult); evaluation.totalError += lowest; From 2b514fe5434dc3c9612d6e97c36ecb67fb683609 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Wed, 17 Apr 2024 21:35:30 -0500 Subject: [PATCH 8/8] Added required comments --- README.md | 52 ++++++++++++++++++++++++++++++++++++++++++++ src/onerule/main.cpp | 2 +- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index af083f7..fd11095 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,12 @@ # arff-mining +## Author, Date, Description +- Gregory Crawford +- Date: 2024-04-17 +- Description: Read data from ARFF file and generate OneR output +- Performance summary at the bottom of this README + + ## Compiling the project Prerequisites @@ -10,6 +17,7 @@ In order to compile the project, simply run these two commands: cmake -B build -S . cmake --build build + ## Running the Project The programs should now be compiled at ./build/bin/ @@ -23,3 +31,47 @@ One Rule: ```plain build/bin/onerule ``` + + +## Performance Summary + +***contactLenses*** +``` + tear-prod-rate: + normal ---> soft + reduced ---> none +Error rate: 7/24 +build/bin/onerule 0.00s user 0.00s system 85% cpu 0.002 total +``` + +***restaurants*** +``` + Pat: + Full ---> No + None ---> No + Some ---> Yes +Error rate: 2/12 +build/bin/onerule 0.00s user 0.00s system 85% cpu 0.001 total +``` + +***soybean*** +``` + fruit-spots: + absent ---> alternarialeaf-spot + brown-w/blk-specks ---> anthracnose + colored ---> frog-eye-leaf-spot + distort ---> alternarialeaf-spot + dna ---> brown-stem-rot +Error rate: 385/683 +build/bin/onerule 0.02s user 0.00s system 98% cpu 0.024 total +``` + +***weather*** +``` + outlook: + overcast ---> yes + rainy ---> yes + sunny ---> no +Error rate: 4/14 +build/bin/onerule 0.00s user 0.00s system 87% cpu 0.001 total +``` diff --git a/src/onerule/main.cpp b/src/onerule/main.cpp index d6b66b3..1fea221 100644 --- a/src/onerule/main.cpp +++ b/src/onerule/main.cpp @@ -1,7 +1,7 @@ /* * Author: Gregory Crawford * Date: 2024-03-18 - * Description: Read and store ARFF data from a file + * Description: Read data from ARFF file and generate OneR output */ #include "arff.hpp"