diff --git a/.gitignore b/.gitignore
index e257658..eb18772 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,5 @@
 *.out
 *.app
 
+# Build directory
+build
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..5fca557
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(
+  weka
+  LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ standard to use")
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+
+add_subdirectory(src)
diff --git a/README.md b/README.md
index ab57733..2190089 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,21 @@
-# weka
+# arff-mining
 
+## Compiling the project
+
+Prerequisites
+ - A C++11 compiler
+ - CMake 3.13 or newer (the `-B`/`-S` options used below require it)
+
+In order to compile the project, simply run these two commands:
+
+    cmake -B build -S .
+    cmake --build build
+
+## Running the Project
+
+The programs should now be compiled at ./build/bin/
+
+ARFF:
+```plain
+build/bin/arff
+```
diff --git a/data/contact-lenses.arff b/data/contact-lenses.arff
new file mode 100644
index 0000000..bf86d95
--- /dev/null
+++ b/data/contact-lenses.arff
@@ -0,0 +1,52 @@
+% Title: Database for fitting contact lenses
+%
+% Number of Instances: 24
+%
+% Number of Attributes: 4 (all nominal)
+%
+% Attribute Information -- 3 Classes:
+% 1 : the patient should be fitted with hard contact lenses,
+% 2 : the patient should be fitted with soft contact lenses,
+% 3 : the patient should not be fitted with contact lenses.
+%
+% Class Distribution:
+% 1. hard contact lenses: 4
+% 2. soft contact lenses: 5
+% 3. no contact lenses: 15
+
+@relation contact-lenses
+
+@attribute age {young, pre-presbyopic, presbyopic}
+@attribute spectacle-prescrip {myope, hypermetrope}
+@attribute astigmatism {no, yes}
+@attribute tear-prod-rate {reduced, normal}
+@attribute contact-lenses {soft, hard, none}
+
+@data
+%
+% 24 instances
+%
+young,myope,no,reduced,none
+young,myope,no,normal,soft
+young,myope,yes,reduced,none
+young,myope,yes,normal,hard
+young,hypermetrope,no,reduced,none
+young,hypermetrope,no,normal,soft
+young,hypermetrope,yes,reduced,none
+young,hypermetrope,yes,normal,hard
+pre-presbyopic,myope,no,reduced,none
+pre-presbyopic,myope,no,normal,soft
+pre-presbyopic,myope,yes,reduced,none
+pre-presbyopic,myope,yes,normal,hard
+pre-presbyopic,hypermetrope,no,reduced,none
+pre-presbyopic,hypermetrope,no,normal,soft
+pre-presbyopic,hypermetrope,yes,reduced,none
+pre-presbyopic,hypermetrope,yes,normal,none
+presbyopic,myope,no,reduced,none
+presbyopic,myope,no,normal,none
+presbyopic,myope,yes,reduced,none
+presbyopic,myope,yes,normal,hard
+presbyopic,hypermetrope,no,reduced,none
+presbyopic,hypermetrope,no,normal,soft
+presbyopic,hypermetrope,yes,reduced,none
+presbyopic,hypermetrope,yes,normal,none
diff --git a/data/restaurant.arff b/data/restaurant.arff
new file mode 100644
index 0000000..5ed140c
--- /dev/null
+++ b/data/restaurant.arff
@@ -0,0 +1,28 @@
+@relation restaurant
+% determine whether a customer will wait for a table or not
+
+@attribute Alt {Yes, No}
+@attribute Bar {Yes, No}
+@attribute Fri {Yes, No}
+@attribute Hun {Yes, No}
+@attribute Pat {Some, Full, None}
+@attribute Price {$, $$, $$$}
+@attribute Rain {Yes, No}
+@attribute Res {Yes, No}
+@attribute Type {French, Thai, Burger, Italian}
+@attribute Est {0-10, 10-30, 30-60, >60}
+@attribute Wait {Yes, No}
+
+@data
+Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10,Yes
+Yes,No,No,Yes,Full,$,No,No,Thai,30-60,No
+No,Yes,No,No,Some,$,No,No,Burger,0-10,Yes
+Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30,Yes
+Yes,No,Yes,No,Full,$$$,No,Yes,French,>60,No
+No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10,Yes
+No,Yes,No,No,None,$,Yes,No,Burger,0-10,No
+No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10,Yes
+No,Yes,Yes,No,Full,$,Yes,No,Burger,>60,No
+Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30,No
+No,No,No,No,None,$,No,No,Thai,0-10,No
+Yes,Yes,Yes,Yes,Full,$,No,No,Burger,30-60,Yes
diff --git a/src/intro/soybean.arff b/data/soybean.arff
similarity index 100%
rename from src/intro/soybean.arff
rename to data/soybean.arff
diff --git a/src/intro/weather.nominal.arff b/data/weather.nominal.arff
similarity index 100%
rename from src/intro/weather.nominal.arff
rename to data/weather.nominal.arff
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..9e6d78f
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(arff)
diff --git a/src/arff/CMakeLists.txt b/src/arff/CMakeLists.txt
new file mode 100644
index 0000000..524fe0c
--- /dev/null
+++ b/src/arff/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(arff
+  ./main.cpp
+  ./arff.cpp
+)
+
diff --git a/src/arff/arff.cpp b/src/arff/arff.cpp
new file mode 100644
index 0000000..49dc8ef
--- /dev/null
+++ b/src/arff/arff.cpp
@@ -0,0 +1,230 @@
+#include "arff.hpp"
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+namespace {
+    // Returns `text` without leading/trailing blanks; this also drops the
+    // '\r' that CRLF-terminated files leave at the end of every line.
+    std::string Trim(const std::string& text) {
+        const char* const blanks = " \t\r\n";
+        const std::string::size_type first = text.find_first_not_of(blanks);
+        if (first == std::string::npos) { return std::string(); }
+        const std::string::size_type last = text.find_last_not_of(blanks);
+        return text.substr(first, last - first + 1);
+    }
+
+    // Lower-cases ASCII letters so keywords can be matched ignoring case.
+    std::string ToLower(std::string text) {
+        std::transform(text.begin(), text.end(), text.begin(),
+            [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        return text;
+    }
+}
+
+namespace ARFF {
+    bool isVerbose = false;
+
+    // Turns on verbose logging when -v or --verbose is on the command line.
+    void ParseArguments(int argc, char* argv[]) {
+        // argv[0] is the program name, so option scanning starts at 1.
+        for (int i = 1; i < argc; ++i) {
+            const std::string argument(argv[i]);
+            if (argument == "-v" || argument == "--verbose") {
+                isVerbose = true;
+            }
+        }
+    }
+
+    // Prompts on stdout and reads one whitespace-delimited filename from
+    // stdin; terminates the program when nothing could be read.
+    std::string GetDataFilename(void) {
+        std::string filename;
+        std::cout << "Please enter name of the data file:\t";
+        std::cin >> filename;
+        if (filename.empty()) {
+            LogError("ARFF/Setup", "No data filename provided, exiting...");
+            std::exit(1);
+        }
+        std::cout << std::endl;
+        return filename;
+    }
+
+    // Writes an informational message to stdout; silent unless verbose.
+    void LogInfo(const std::string& location, const std::string& message) {
+        if (!isVerbose) { return; }
+        std::cout << '[' << location << " - INFO] " << message << std::endl;
+    }
+
+    // Writes an error message to stderr. Errors are shown regardless of the
+    // verbose flag so that a failing run never terminates silently.
+    void LogError(const std::string& location, const std::string& message) {
+        std::cerr << '[' << location << " - ERROR] " << message << std::endl;
+    }
+
+    AttributeType::AttributeType(std::string attribute)
+        : attribute(attribute) {}
+
+    void AttributeType::AddValue(std::string value) {
+        values.emplace_back(value);
+    }
+
+    Instance::Instance(const int id, const int size)
+        : id(id), values(size) {}
+
+    // Reads the whole data file, parses every line and validates the result.
+    // Terminates the program when the file cannot be opened.
+    void Arff::Read(const std::string& filename) {
+        std::ifstream dataFile(filename);
+        if (!dataFile.is_open()) {
+            LogError("ARFF/Read", "Unable to open file with name `"
+                + filename + "`, exiting...");
+            std::exit(1);
+        }
+        std::string rawLine;
+        while (std::getline(dataFile, rawLine)) {
+            const std::string line = Trim(rawLine);
+            // Blank lines carry no information; skipping them here also
+            // guarantees that line.front() below is valid.
+            if (line.empty()) { continue; }
+            switch (line.front()) {
+                case '%':
+                    // Comment line in data
+                    break;
+                case '@':
+                    AddAttribute(line);
+                    break;
+                default:
+                    AddData(line);
+                    break;
+            }
+        }
+        TestIntegrity();
+    }
+
+    // Prints the attribute summary followed by every stored instance.
+    void Arff::Print(void) const {
+        std::cout << attributeList.size() << " attributes\n";
+        std::cout << database.size() << " examples\n";
+        std::cout << std::endl;
+
+        std::cout << "Attribute (#): values\n";
+        for (const AttributeType& type : attributeList) {
+            std::cout << type.attribute << " (" << type.values.size() << "):";
+            for (const std::string& value : type.values) {
+                std::cout << " " << value;
+            }
+            std::cout << '\n';
+        }
+        std::cout << std::endl;
+
+        std::cout << relation << '\n';
+        for (const Instance& instance : database) {
+            for (const std::string& value : instance.values) {
+                std::cout << '\t' << value;
+            }
+            std::cout << '\n';
+        }
+        std::cout << std::endl;
+    }
+
+    // Handles one header line ("@relation", "@attribute" or "@data").
+    // Keywords are matched case-insensitively, as the ARFF format requires.
+    void Arff::AddAttribute(const std::string& line) {
+        std::istringstream parser(line);
+        std::string keyword;
+        parser >> keyword;
+        keyword = ToLower(keyword);
+        // Signifies beginning of data
+        // Might add a boolean later to mark this
+        if (keyword == "@data") {
+            return;
+        }
+        if (keyword == "@relation") {
+            parser >> relation;
+            LogInfo("ARFF/Attribute", "Relation set: " + relation);
+            return;
+        }
+        if (keyword != "@attribute") {
+            // Do not register an unknown declaration as a bogus attribute.
+            LogError("ARFF/Attribute", "Ignoring unknown declaration: " + line);
+            return;
+        }
+        std::string name;
+        parser >> name;
+        attributeList.emplace_back(name);
+        LogInfo("ARFF/Attribute", "Added attribute: " + name);
+
+        // The remainder is the value list, e.g. "{a, b}" or "{a,b}". Splitting
+        // on commas (not whitespace) keeps densely written lists apart.
+        std::string valueList;
+        std::getline(parser, valueList);
+        valueList.erase(std::remove(valueList.begin(), valueList.end(), '{'),
+            valueList.end());
+        valueList.erase(std::remove(valueList.begin(), valueList.end(), '}'),
+            valueList.end());
+        std::istringstream valueParser(valueList);
+        std::string value;
+        while (std::getline(valueParser, value, ',')) {
+            value = Trim(value);
+            if (value.empty()) { continue; }
+            attributeList.back().AddValue(value);
+            LogInfo("ARFF/Attribute", "Added value: " + value);
+        }
+    }
+
+    // Adds one comma-separated data row to the runtime database.
+    void Arff::AddData(const std::string& line) {
+        std::istringstream parser(line);
+        int id = 0;
+        if (!database.empty()) { id = static_cast<int>(database.back().id) + 1; }
+        database.emplace_back(id, static_cast<int>(attributeList.size()));
+        LogInfo("ARFF/Data", "Added id: " + std::to_string(database.back().id));
+        for (std::size_t i = 0; i < attributeList.size(); ++i) {
+            // A fresh token per column: when the row is too short the field
+            // stays empty instead of silently repeating the previous value.
+            std::string token;
+            std::getline(parser, token, ',');
+            token = Trim(token);
+            database.back().values.at(i) = token;
+            LogInfo("ARFF/Data", "Added instance value: " + token);
+        }
+    }
+
+    // Verifies that every stored value is declared for its attribute (or is
+    // the ARFF missing-value marker "?"); terminates the program otherwise.
+    void Arff::TestIntegrity(void) const {
+        for (const Instance& instance : database) {
+            std::size_t successCheck = 0;
+            for (std::size_t i = 0; i < attributeList.size(); ++i) {
+                const std::string& candidate = instance.values.at(i);
+                LogInfo("ARFF/Integrity", "Instance value tested: '"
+                    + candidate + "'");
+                if (candidate == "?") {
+                    ++successCheck;
+                    continue;
+                }
+                for (const std::string& value : attributeList.at(i).values) {
+                    LogInfo("ARFF/Integrity", "attributeList value: '"
+                        + value + "'");
+                    if (candidate == value) {
+                        LogInfo("ARFF/Integrity", "Value found: " + value);
+                        ++successCheck;
+                        break;
+                    }
+                }
+            }
+            if (successCheck != attributeList.size()) {
+                LogError("ARFF/Integrity", "Value size mismatch: "
+                    + std::to_string(successCheck) + " out of "
+                    + std::to_string(attributeList.size()));
+                std::exit(1);
+            }
+        }
+        LogInfo("ARFF/Integrity", "All values exist, continuing...");
+    }
+}
diff --git a/src/arff/arff.hpp b/src/arff/arff.hpp
new file mode 100644
index 0000000..8afc7af
--- /dev/null
+++ b/src/arff/arff.hpp
@@ -0,0 +1,48 @@
+#ifndef ARFF_HPP
+#define ARFF_HPP
+
+#include <string>
+#include <vector>
+
+namespace ARFF {
+    // Command-line and logging helpers shared by the ARFF tools.
+    void ParseArguments(int argc, char* argv[]);
+    std::string GetDataFilename(void);
+    void LogInfo(const std::string& location, const std::string& message);
+    void LogError(const std::string& location, const std::string& message);
+
+    // A declared attribute together with the values it may take.
+    struct AttributeType {
+    public:
+        std::string attribute;
+        std::vector<std::string> values;
+        AttributeType(std::string attribute);
+        void AddValue(std::string value);
+    };
+
+    // One data row; values[i] belongs to the i-th declared attribute.
+    struct Instance {
+    public:
+        Instance(const int id, const int size);
+        unsigned int id;
+        std::vector<std::string> values;
+    };
+
+    // In-memory representation of one ARFF file.
+    class Arff {
+    public:
+        Arff() = default;
+        void Read(const std::string& filename);
+        void Print(void) const;
+    private:
+        std::string relation;
+        std::vector<AttributeType> attributeList;
+        std::vector<Instance> database;
+        void AddAttribute(const std::string& line);
+        void AddData(const std::string& line);
+        void TestIntegrity(void) const;
+    };
+
+}
+
+#endif
diff --git a/src/arff/main.cpp b/src/arff/main.cpp
new file mode 100644
index 0000000..4f24fae
--- /dev/null
+++ b/src/arff/main.cpp
@@ -0,0 +1,13 @@
+/*
+ * Author: Gregory Crawford
+ * Date: 2024-03-18
+ * Description: Read and store ARFF data from a file
+ */
+#include "arff.hpp"
+
+int main(int argc, char* argv[]) {
+    ARFF::ParseArguments(argc, argv);
+    ARFF::Arff data;
+    data.Read(ARFF::GetDataFilename());
+    data.Print();
+}