From c17a1980032bca1666e10c3f0507ce4b2f056396 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:02:24 -0500 Subject: [PATCH 1/8] Moved arff files to data folder --- data/contact-lenses.arff | 52 ++++++++++++++++++++++++ data/restaurant.arff | 28 +++++++++++++ {src/intro => data}/soybean.arff | 0 {src/intro => data}/weather.nominal.arff | 0 4 files changed, 80 insertions(+) create mode 100644 data/contact-lenses.arff create mode 100644 data/restaurant.arff rename {src/intro => data}/soybean.arff (100%) rename {src/intro => data}/weather.nominal.arff (100%) diff --git a/data/contact-lenses.arff b/data/contact-lenses.arff new file mode 100644 index 0000000..bf86d95 --- /dev/null +++ b/data/contact-lenses.arff @@ -0,0 +1,52 @@ +% Title: Database for fitting contact lenses +% +% Number of Instances: 24 +% +% Number of Attributes: 4 (all nominal) +% +% Attribute Information -- 3 Classes: +% 1 : the patient should be fitted with hard contact lenses, +% 2 : the patient should be fitted with soft contact lenses, +% 3 : the patient should not be fitted with contact lenses. +% +% Class Distribution: +% 1. hard contact lenses: 4 +% 2. soft contact lenses: 5 +% 3. no contact lenses: 15 + +@relation contact-lenses + +@attribute age {young, pre-presbyopic, presbyopic} +@attribute spectacle-prescrip {myope, hypermetrope} +@attribute astigmatism {no, yes} +@attribute tear-prod-rate {reduced, normal} +@attribute contact-lenses {soft, hard, none} + +@data +% +% 24 instances +% +young,myope,no,reduced,none +young,myope,no,normal,soft +young,myope,yes,reduced,none +young,myope,yes,normal,hard +young,hypermetrope,no,reduced,none +young,hypermetrope,no,normal,soft +young,hypermetrope,yes,reduced,none +young,hypermetrope,yes,normal,hard +pre-presbyopic,myope,no,reduced,none +pre-presbyopic,myope,no,normal,soft +pre-presbyopic,myope,yes,reduced,none +pre-presbyopic,myope,yes,normal,hard +pre-presbyopic,hypermetrope,no,reduced,none +pre-presbyopic,hypermetrope,no,normal,soft +pre-presbyopic,hypermetrope,yes,reduced,none +pre-presbyopic,hypermetrope,yes,normal,none +presbyopic,myope,no,reduced,none +presbyopic,myope,no,normal,none +presbyopic,myope,yes,reduced,none +presbyopic,myope,yes,normal,hard +presbyopic,hypermetrope,no,reduced,none +presbyopic,hypermetrope,no,normal,soft +presbyopic,hypermetrope,yes,reduced,none +presbyopic,hypermetrope,yes,normal,none diff --git a/data/restaurant.arff b/data/restaurant.arff new file mode 100644 index 0000000..5ed140c --- /dev/null +++ b/data/restaurant.arff @@ -0,0 +1,28 @@ +@relation restaurant +% determine whether a customer will wait for a table or not + +@attribute Alt {Yes, No} +@attribute Bar {Yes, No} +@attribute Fri {Yes, No} +@attribute Hun {Yes, No} +@attribute Pat {Some, Full, None} +@attribute Price {$, $$, $$$} +@attribute Rain {Yes, No} +@attribute Res {Yes, No} +@attribute Type {French, Thai, Burger, Italian} +@attribute Est {0-10, 10-30, 30-60, >60} +@attribute Wait {Yes, No} + +@data +Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10,Yes +Yes,No,No,Yes,Full,$,No,No,Thai,30-60,No +No,Yes,No,No,Some,$,No,No,Burger,0-10,Yes +Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30,Yes +Yes,No,Yes,No,Full,$$$,No,Yes,French,>60,No +No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10,Yes +No,Yes,No,No,None,$,Yes,No,Burger,0-10,No +No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10,Yes +No,Yes,Yes,No,Full,$,Yes,No,Burger,>60,No +Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30,No +No,No,No,No,None,$,No,No,Thai,0-10,No +Yes,Yes,Yes,Yes,Full,$,No,No,Burger,30-60,Yes diff --git a/src/intro/soybean.arff b/data/soybean.arff similarity index 100% rename from src/intro/soybean.arff rename to data/soybean.arff diff --git a/src/intro/weather.nominal.arff b/data/weather.nominal.arff similarity index 100% rename from src/intro/weather.nominal.arff rename to data/weather.nominal.arff From 917dbec7a150b531dc661371edea0f4d5046fd29 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:03:27 -0500 Subject: [PATCH 2/8] Created initial cpp file --- src/arff/main.cpp | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 src/arff/main.cpp diff --git a/src/arff/main.cpp b/src/arff/main.cpp new file mode 100644 index 0000000..df55c5a --- /dev/null +++ b/src/arff/main.cpp @@ -0,0 +1,5 @@ +#include + +int main(void) { + std::cout << "Hello world" << std::endl; +} From 4c29af235e829921c00d59def836f0ff87b1e2ad Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:09:48 -0500 Subject: [PATCH 3/8] Added base README instructions --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index ab57733..a2772b3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,20 @@ # weka +## Compiling the project + +Prerequisites + - C++11 + +In order to compile the project, simply run these two commands: + + cmake -B build -S . + cmake --build build + +## Running the Project + +The programs should now be compiled at ./build/bin/ + +ARFF: +```plain +build/bin/something +``` From 637cde0e97ac70b1cea5849bbca8677135bb733f Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:13:40 -0500 Subject: [PATCH 4/8] Add gitignore for cmake build --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index e257658..eb18772 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ *.out *.app +# Build directory +build From e0d93f7bccfc14131aef8a4ef71c364a9c8a9b76 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:15:10 -0500 Subject: [PATCH 5/8] Added CMake building for current and future of project --- CMakeLists.txt | 12 ++++++++++++ src/CMakeLists.txt | 1 + src/arff/CMakeLists.txt | 4 ++++ 3 files changed, 17 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 src/CMakeLists.txt create mode 100644 src/arff/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..5fca557 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.10) + +project( + weka + LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ standard to use") +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) + +add_subdirectory(src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..9e6d78f --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(arff) diff --git a/src/arff/CMakeLists.txt b/src/arff/CMakeLists.txt new file mode 100644 index 0000000..2b5d56b --- /dev/null +++ b/src/arff/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(arff + ./main.cpp +) + From d30428dc4f82ddc60daa42217856cb73daebe8fb Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 19:19:47 -0500 Subject: [PATCH 6/8] Added reading in data and a verbose mode --- src/arff/CMakeLists.txt | 1 + src/arff/arff.cpp | 157 ++++++++++++++++++++++++++++++++++++++++ src/arff/arff.hpp | 43 +++++++++++ src/arff/main.cpp | 13 +++- 4 files changed, 211 insertions(+), 3 deletions(-) create mode 100644 src/arff/arff.cpp create mode 100644 src/arff/arff.hpp diff --git a/src/arff/CMakeLists.txt b/src/arff/CMakeLists.txt index 2b5d56b..524fe0c 100644 --- a/src/arff/CMakeLists.txt +++ b/src/arff/CMakeLists.txt @@ -1,4 +1,5 @@ add_executable(arff ./main.cpp + ./arff.cpp ) diff --git a/src/arff/arff.cpp b/src/arff/arff.cpp new file mode 100644 index 0000000..a247e78 --- /dev/null +++ b/src/arff/arff.cpp @@ -0,0 +1,157 @@ +#include "arff.hpp" +#include +#include +#include +#include + +namespace ARFF { + bool isVerbose = false; + + void ParseArguments(int argc, char* argv[]) { + std::string argument_string; + for (int i = 0; i < argc; ++i) { + argument_string.assign(argv[i]); + if (argument_string == "-v" || argument_string == "--verbose") { + isVerbose = true; + } + } + } + + std::string GetDataFilename(void) { + std::string filename; + std::cout << "Please enter name of the data file:\t"; + std::cin >> filename; + if (filename.empty()) { + LogError("ARFF/Setup", "No data filename provided, exiting..."); + exit(1); + } + std::cout << std::endl; + return filename; + } + + void LogInfo(const std::string location, const std::string message) { + if (!isVerbose) { return; } + std::cout << '[' << location << " - INFO] "; + std::cout << message << std::endl; + } + + void LogError(const std::string location, const std::string message) { + if (!isVerbose) { return; } + std::cerr << '[' << location << " - ERROR] "; + std::cerr << message << std::endl; + } + + AttributeType::AttributeType(std::string attribute) { + this->attribute = attribute; + } + + void AttributeType::AddValue(std::string value) { + values.emplace_back(value); + } + + Instance::Instance(const int id, const int size) { + this->id = id; + this->values.resize(size); + } + + // Read entire data file and parse it + void Arff::Read(std::string filename) { + std::ifstream dataFile(filename); + if (!dataFile.is_open()) { + LogError("ARFF/Read", "Unable to open file with name `" + + filename + ", exiting..."); + exit(1); + } + std::string line; + while (std::getline(dataFile, line)) { + if (line.size() == 1) { continue; } + switch (line.at(0)) { + case '%': + // Comment line in data + continue; + break; + case '@': + AddAttribute(line); + break; + default: + AddData(line); + break; + } + } + TestIntegrity(); + } + + // Add the attribute to the list + void Arff::AddAttribute(std::string line) { + std::stringstream parser(line); + std::string token; + parser >> token; + // Signifies beginning of data + // Might add a boolean later to mark this + if (token == "@data") { + return; + } + if (token == "@relation") { + parser >> token; + relation = token; + LogInfo("ARFF/Attribute", "Relation set: " + relation); + return; + } + parser >> token; + attributeList.emplace_back(token); + LogInfo("ARFF/Attribute", "Added attribute: " + token); + while (parser >> token) { + // Clean token from outside pieces + token.erase(std::remove(token.begin(), token.end(), '{'), token.end()); + token.erase(std::remove(token.begin(), token.end(), '}'), token.end()); + token.erase(std::remove(token.begin(), token.end(), ','), token.end()); + token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); + token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); + attributeList.back().AddValue(token); + LogInfo("ARFF/Attribute", "Added value: " + token); + } + } + + // Add data to runtime database + void Arff::AddData(std::string line) { + std::istringstream parser(line); + std::string token; + int id = 0; + if (!database.empty()) { id = database.back().id + 1; } + database.emplace_back(id, attributeList.size()); + LogInfo("ARFF/Data", "Added id: " + std::to_string(database.back().id)); + for (int i = 0; i < attributeList.size(); ++i) { + std::getline(parser, token, ','); + token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); + token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); + database.back().values.at(i) = token; + LogInfo("ARFF/Data", "Added instance value: " + token); + } + } + + void Arff::TestIntegrity(void) { + for (Instance instance : database) { + int successCheck = 0; + for (int i = 0; i < attributeList.size(); ++i) { + LogInfo("ARFF/Integrity", "Instance value tested: '" + + instance.values.at(i) + "'"); + for (std::string value : attributeList.at(i).values) { + LogInfo("ARFF/Integrity", "attributeList value: '" + + value + "'"); + if (instance.values.at(i) == value) { + LogInfo("ARFF/Integrity", "Value found: " + value); + ++successCheck; + break; + } + } + } + if (successCheck != attributeList.size()) { + LogError("ARFF/Integrity", "Value size mismatch: " + + std::to_string(successCheck) + " out of " + + std::to_string(attributeList.size())); + exit(1); + } + } + LogInfo("ARFF/Integrity", "All values exist, continuing..."); + } +} diff --git a/src/arff/arff.hpp b/src/arff/arff.hpp new file mode 100644 index 0000000..7334c85 --- /dev/null +++ b/src/arff/arff.hpp @@ -0,0 +1,43 @@ +#ifndef ARFF_HPP +#define ARFF_HPP + +#include +#include + +namespace ARFF { + void ParseArguments(int argc, char* argv[]); + std::string GetDataFilename(void); + void LogInfo(const std::string location, const std::string message); + void LogError(const std::string location, const std::string message); + + struct AttributeType { + public: + std::string attribute; + std::vector values; + AttributeType(std::string attribute); + void AddValue(std::string value); + }; + + struct Instance { + public: + Instance(const int id, const int size); + unsigned int id; + std::vector values; + }; + + class Arff { + public: + Arff() = default; + void Read(std::string filename); + private: + std::string relation; + std::vector attributeList; + std::vector database; + void AddAttribute(std::string line); + void AddData(std::string line); + void TestIntegrity(void); + }; + +} + +#endif diff --git a/src/arff/main.cpp b/src/arff/main.cpp index df55c5a..11caf3d 100644 --- a/src/arff/main.cpp +++ b/src/arff/main.cpp @@ -1,5 +1,12 @@ -#include +/* + * Author: Gregory Crawford + * Date: 2024-03-18 + * Description: Read and store ARFF data from a file + */ +#include "arff.hpp" -int main(void) { - std::cout << "Hello world" << std::endl; +int main(int argc, char* argv[]) { + ARFF::ParseArguments(argc, argv); + ARFF::Arff data; + data.Read(ARFF::GetDataFilename()); } From e32a65a660962da24d70588c3995cf2b71fad0b5 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 19:52:05 -0500 Subject: [PATCH 7/8] Added print function for outputting database and attributes --- src/arff/arff.cpp | 25 +++++++++++++++++++++++++ src/arff/arff.hpp | 1 + src/arff/main.cpp | 1 + 3 files changed, 27 insertions(+) diff --git a/src/arff/arff.cpp b/src/arff/arff.cpp index a247e78..49dc8ef 100644 --- a/src/arff/arff.cpp +++ b/src/arff/arff.cpp @@ -81,6 +81,31 @@ namespace ARFF { TestIntegrity(); } + void Arff::Print(void) { + std::cout << attributeList.size() << " attributes\n"; + std::cout << database.size() << " examples\n"; + std::cout << std::endl; + + std::cout << "Attribute (#): values\n"; + for (AttributeType type : attributeList) { + std::cout << type.attribute << " (" << type.values.size() << "):"; + for (std::string value : type.values) { + std::cout << " " << value; + } + std::cout << '\n'; + } + std::cout << std::endl; + + std::cout << relation << '\n'; + for (Instance instance: database) { + for (std::string value : instance.values) { + std::cout << '\t' << value; + } + std::cout << '\n'; + } + std::cout << std::endl; + } + // Add the attribute to the list void Arff::AddAttribute(std::string line) { std::stringstream parser(line); diff --git a/src/arff/arff.hpp b/src/arff/arff.hpp index 7334c85..8afc7af 100644 --- a/src/arff/arff.hpp +++ b/src/arff/arff.hpp @@ -29,6 +29,7 @@ namespace ARFF { public: Arff() = default; void Read(std::string filename); + void Print(void); private: std::string relation; std::vector attributeList; diff --git a/src/arff/main.cpp b/src/arff/main.cpp index 11caf3d..4f24fae 100644 --- a/src/arff/main.cpp +++ b/src/arff/main.cpp @@ -9,4 +9,5 @@ int main(int argc, char* argv[]) { ARFF::ParseArguments(argc, argv); ARFF::Arff data; data.Read(ARFF::GetDataFilename()); + data.Print(); } From a9d9dd8b79234ce753735d0599ccde5375fa7b28 Mon Sep 17 00:00:00 2001 From: Trianta <56975502+Trimutex@users.noreply.github.com> Date: Mon, 18 Mar 2024 19:59:09 -0500 Subject: [PATCH 8/8] New repository name and fixed exec path --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a2772b3..2190089 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# weka +# arff-mining ## Compiling the project @@ -16,5 +16,5 @@ The programs should now be compiled at ./build/bin/ ARFF: ```plain -build/bin/something +build/bin/arff ```