feat: read data from arff files #1
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -32,3 +32,5 @@
 | 
			
		||||
*.out
 | 
			
		||||
*.app
 | 
			
		||||
 | 
			
		||||
# Build directory
 | 
			
		||||
build
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										12
									
								
								CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,12 @@
 | 
			
		||||
# Top-level build script for the weka project.
cmake_minimum_required(VERSION 3.10)

project(
    weka
    LANGUAGES CXX)

# Build as strict ISO C++11 by default (no compiler extensions). The standard
# level is a cache entry, so it can be overridden at configure time, e.g.
# -DCMAKE_CXX_STANDARD=17.
set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ standard to use")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Place every executable in <build>/bin. CMAKE_RUNTIME_OUTPUT_DIRECTORY replaces
# the legacy EXECUTABLE_OUTPUT_PATH variable; it initialises the
# RUNTIME_OUTPUT_DIRECTORY property of targets created afterwards, so it must be
# set before add_subdirectory() below.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")

add_subdirectory(src)
 | 
			
		||||
							
								
								
									
										20
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								README.md
									
									
									
									
									
								
							@ -1,2 +1,20 @@
 | 
			
		||||
# weka
 | 
			
		||||
# arff-mining
 | 
			
		||||
 | 
			
		||||
## Compiling the project
 | 
			
		||||
 | 
			
		||||
Prerequisites
 | 
			
		||||
  - A C++11 compiler
  - CMake 3.13 or newer (the `-S`/`-B` options used below were added in 3.13)
 | 
			
		||||
 | 
			
		||||
In order to compile the project, simply run these two commands:
 | 
			
		||||
 | 
			
		||||
    cmake -B build -S .
 | 
			
		||||
    cmake --build build
 | 
			
		||||
 | 
			
		||||
## Running the Project
 | 
			
		||||
 | 
			
		||||
The compiled programs are placed in `./build/bin/`.
 | 
			
		||||
 | 
			
		||||
ARFF:
 | 
			
		||||
```plain
 | 
			
		||||
build/bin/arff
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										52
									
								
								data/contact-lenses.arff
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								data/contact-lenses.arff
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,52 @@
 | 
			
		||||
% Title: Database for fitting contact lenses
 | 
			
		||||
% 
 | 
			
		||||
% Number of Instances: 24
 | 
			
		||||
% 
 | 
			
		||||
% Number of Attributes: 4 (all nominal)
 | 
			
		||||
% 
 | 
			
		||||
% Attribute Information -- 3 Classes:
 | 
			
		||||
%   1 : the patient should be fitted with hard contact lenses,
 | 
			
		||||
%   2 : the patient should be fitted with soft contact lenses,
 | 
			
		||||
%   3 : the patient should not be fitted with contact lenses.
 | 
			
		||||
%  
 | 
			
		||||
% Class Distribution:
 | 
			
		||||
%    1. hard contact lenses: 4
 | 
			
		||||
%    2. soft contact lenses: 5
 | 
			
		||||
%    3. no contact lenses: 15
 | 
			
		||||
 | 
			
		||||
@relation contact-lenses
 | 
			
		||||
 | 
			
		||||
@attribute age 			{young, pre-presbyopic, presbyopic}
 | 
			
		||||
@attribute spectacle-prescrip	{myope, hypermetrope}
 | 
			
		||||
@attribute astigmatism		{no, yes}
 | 
			
		||||
@attribute tear-prod-rate	{reduced, normal}
 | 
			
		||||
@attribute contact-lenses	{soft, hard, none}
 | 
			
		||||
 | 
			
		||||
@data
 | 
			
		||||
%
 | 
			
		||||
% 24 instances
 | 
			
		||||
%
 | 
			
		||||
young,myope,no,reduced,none
 | 
			
		||||
young,myope,no,normal,soft
 | 
			
		||||
young,myope,yes,reduced,none
 | 
			
		||||
young,myope,yes,normal,hard
 | 
			
		||||
young,hypermetrope,no,reduced,none
 | 
			
		||||
young,hypermetrope,no,normal,soft
 | 
			
		||||
young,hypermetrope,yes,reduced,none
 | 
			
		||||
young,hypermetrope,yes,normal,hard
 | 
			
		||||
pre-presbyopic,myope,no,reduced,none
 | 
			
		||||
pre-presbyopic,myope,no,normal,soft
 | 
			
		||||
pre-presbyopic,myope,yes,reduced,none
 | 
			
		||||
pre-presbyopic,myope,yes,normal,hard
 | 
			
		||||
pre-presbyopic,hypermetrope,no,reduced,none
 | 
			
		||||
pre-presbyopic,hypermetrope,no,normal,soft
 | 
			
		||||
pre-presbyopic,hypermetrope,yes,reduced,none
 | 
			
		||||
pre-presbyopic,hypermetrope,yes,normal,none
 | 
			
		||||
presbyopic,myope,no,reduced,none
 | 
			
		||||
presbyopic,myope,no,normal,none
 | 
			
		||||
presbyopic,myope,yes,reduced,none
 | 
			
		||||
presbyopic,myope,yes,normal,hard
 | 
			
		||||
presbyopic,hypermetrope,no,reduced,none
 | 
			
		||||
presbyopic,hypermetrope,no,normal,soft
 | 
			
		||||
presbyopic,hypermetrope,yes,reduced,none
 | 
			
		||||
presbyopic,hypermetrope,yes,normal,none
 | 
			
		||||
							
								
								
									
										28
									
								
								data/restaurant.arff
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								data/restaurant.arff
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,28 @@
 | 
			
		||||
@relation restaurant
 | 
			
		||||
% determine whether a customer will wait for a table or not
 | 
			
		||||
 | 
			
		||||
@attribute Alt {Yes, No}
 | 
			
		||||
@attribute Bar {Yes, No}
 | 
			
		||||
@attribute Fri {Yes, No}
 | 
			
		||||
@attribute Hun {Yes, No}
 | 
			
		||||
@attribute Pat {Some, Full, None}
 | 
			
		||||
@attribute Price {$, $$, $$$}
 | 
			
		||||
@attribute Rain {Yes, No}
 | 
			
		||||
@attribute Res {Yes, No}
 | 
			
		||||
@attribute Type {French, Thai, Burger, Italian}
 | 
			
		||||
@attribute Est {0-10, 10-30, 30-60, >60}
 | 
			
		||||
@attribute Wait {Yes, No}
 | 
			
		||||
 | 
			
		||||
@data
 | 
			
		||||
Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10,Yes
 | 
			
		||||
Yes,No,No,Yes,Full,$,No,No,Thai,30-60,No
 | 
			
		||||
No,Yes,No,No,Some,$,No,No,Burger,0-10,Yes
 | 
			
		||||
Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30,Yes
 | 
			
		||||
Yes,No,Yes,No,Full,$$$,No,Yes,French,>60,No
 | 
			
		||||
No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10,Yes
 | 
			
		||||
No,Yes,No,No,None,$,Yes,No,Burger,0-10,No
 | 
			
		||||
No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10,Yes
 | 
			
		||||
No,Yes,Yes,No,Full,$,Yes,No,Burger,>60,No
 | 
			
		||||
Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30,No
 | 
			
		||||
No,No,No,No,None,$,No,No,Thai,0-10,No
 | 
			
		||||
Yes,Yes,Yes,Yes,Full,$,No,No,Burger,30-60,Yes
 | 
			
		||||
							
								
								
									
										1
									
								
								src/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								src/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1 @@
 | 
			
		||||
# Descend into the arff/ directory and process its CMakeLists.txt.
add_subdirectory(arff)
 | 
			
		||||
							
								
								
									
										5
									
								
								src/arff/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								src/arff/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,5 @@
 | 
			
		||||
# Executable target `arff`, built from the two sources in this directory.
add_executable(arff main.cpp arff.cpp)
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										182
									
								
								src/arff/arff.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										182
									
								
								src/arff/arff.cpp
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,182 @@
 | 
			
		||||
#include "arff.hpp"
 | 
			
		||||
#include <iostream>
 | 
			
		||||
#include <fstream>
 | 
			
		||||
#include <sstream>
 | 
			
		||||
#include <algorithm>
 | 
			
		||||
 | 
			
		||||
namespace ARFF {
 | 
			
		||||
    // Global verbosity switch; stays off unless -v/--verbose is passed.
    bool isVerbose = false;

    // Scan the command line and turn on verbose mode when "-v" or "--verbose"
    // appears anywhere in argv (argv[0] is inspected as well).
    void ParseArguments(int argc, char* argv[]) {
        int index = 0;
        while (index < argc) {
            const std::string current(argv[index]);
            const bool wantsVerbose = (current == "-v") || (current == "--verbose");
            if (wantsVerbose) {
                isVerbose = true;
            }
            ++index;
        }
    }
 | 
			
		||||
 | 
			
		||||
    std::string GetDataFilename(void) {
 | 
			
		||||
        std::string filename;
 | 
			
		||||
        std::cout << "Please enter name of the data file:\t";
 | 
			
		||||
        std::cin >> filename;
 | 
			
		||||
        if (filename.empty()) {
 | 
			
		||||
            LogError("ARFF/Setup", "No data filename provided, exiting...");
 | 
			
		||||
            exit(1);
 | 
			
		||||
        }
 | 
			
		||||
        std::cout << std::endl;
 | 
			
		||||
        return filename;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void LogInfo(const std::string location, const std::string message) {
 | 
			
		||||
        if (!isVerbose) { return; }
 | 
			
		||||
        std::cout << '[' << location << " - INFO] ";
 | 
			
		||||
        std::cout << message << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Write "[<location> - ERROR] <message>" to stderr.
    //
    // Errors are reported unconditionally: every caller in this file follows
    // LogError with exit(1), so gating the message on verbose mode made the
    // program terminate without any explanation in the default configuration.
    //
    // location: short tag naming the component that failed (e.g. "ARFF/Read").
    // message:  human-readable description of the failure.
    void LogError(const std::string location, const std::string message) {
        std::cerr << '[' << location << " - ERROR] " << message << std::endl;
    }
 | 
			
		||||
 | 
			
		||||
    // Create an attribute with the given name and an empty value list.
    AttributeType::AttributeType(std::string attribute)
        : attribute(attribute) {}
 | 
			
		||||
 | 
			
		||||
    void AttributeType::AddValue(std::string value) {
 | 
			
		||||
        values.emplace_back(value);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Create an instance with the given id and `size` empty value slots.
    Instance::Instance(const int id, const int size)
        : id(id), values(size) {}
 | 
			
		||||
 | 
			
		||||
    // Read entire data file and parse it
 | 
			
		||||
    void Arff::Read(std::string filename) {
 | 
			
		||||
        std::ifstream dataFile(filename);
 | 
			
		||||
        if (!dataFile.is_open()) {
 | 
			
		||||
            LogError("ARFF/Read", "Unable to open file with name `" 
 | 
			
		||||
                    + filename + ", exiting...");
 | 
			
		||||
            exit(1);
 | 
			
		||||
        }
 | 
			
		||||
        std::string line;
 | 
			
		||||
        while (std::getline(dataFile, line)) {
 | 
			
		||||
            if (line.size() == 1) { continue; }
 | 
			
		||||
            switch (line.at(0)) {
 | 
			
		||||
                case '%':
 | 
			
		||||
                    // Comment line in data
 | 
			
		||||
                    continue;
 | 
			
		||||
                    break;
 | 
			
		||||
                case '@':
 | 
			
		||||
                    AddAttribute(line);
 | 
			
		||||
                    break;
 | 
			
		||||
                default:
 | 
			
		||||
                    AddData(line);
 | 
			
		||||
                    break;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        TestIntegrity();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void Arff::Print(void) {
 | 
			
		||||
        std::cout << attributeList.size() << " attributes\n";
 | 
			
		||||
        std::cout << database.size() << " examples\n";
 | 
			
		||||
        std::cout << std::endl;
 | 
			
		||||
 | 
			
		||||
        std::cout << "Attribute (#): values\n";
 | 
			
		||||
        for (AttributeType type : attributeList) {
 | 
			
		||||
            std::cout << type.attribute << " (" << type.values.size() << "):";
 | 
			
		||||
            for (std::string value : type.values) {
 | 
			
		||||
                std::cout << " " << value;
 | 
			
		||||
            }
 | 
			
		||||
            std::cout << '\n';
 | 
			
		||||
        }
 | 
			
		||||
        std::cout << std::endl;
 | 
			
		||||
 | 
			
		||||
        std::cout << relation << '\n';
 | 
			
		||||
        for (Instance instance: database) {
 | 
			
		||||
            for (std::string value : instance.values) {
 | 
			
		||||
                std::cout << '\t' << value;
 | 
			
		||||
            }
 | 
			
		||||
            std::cout << '\n';
 | 
			
		||||
        }
 | 
			
		||||
        std::cout << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Add the attribute to the list
 | 
			
		||||
    void Arff::AddAttribute(std::string line) {
 | 
			
		||||
        std::stringstream parser(line);
 | 
			
		||||
        std::string token;
 | 
			
		||||
        parser >> token;
 | 
			
		||||
        // Signifies beginning of data
 | 
			
		||||
        // Might add a boolean later to mark this
 | 
			
		||||
        if (token == "@data") {
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
        if (token == "@relation") {
 | 
			
		||||
            parser >> token;
 | 
			
		||||
            relation = token;
 | 
			
		||||
            LogInfo("ARFF/Attribute", "Relation set: " + relation);
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
        parser >> token;
 | 
			
		||||
        attributeList.emplace_back(token);
 | 
			
		||||
        LogInfo("ARFF/Attribute", "Added attribute: " + token);
 | 
			
		||||
        while (parser >> token) {
 | 
			
		||||
            // Clean token from outside pieces
 | 
			
		||||
            token.erase(std::remove(token.begin(), token.end(), '{'), token.end());
 | 
			
		||||
            token.erase(std::remove(token.begin(), token.end(), '}'), token.end());
 | 
			
		||||
            token.erase(std::remove(token.begin(), token.end(), ','), token.end());
 | 
			
		||||
            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
 | 
			
		||||
            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
 | 
			
		||||
            attributeList.back().AddValue(token);
 | 
			
		||||
            LogInfo("ARFF/Attribute", "Added value: " + token);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Add data to runtime database
 | 
			
		||||
    void Arff::AddData(std::string line) {
 | 
			
		||||
        std::istringstream parser(line);
 | 
			
		||||
        std::string token;
 | 
			
		||||
        int id = 0;
 | 
			
		||||
        if (!database.empty()) { id = database.back().id + 1; }
 | 
			
		||||
        database.emplace_back(id, attributeList.size());
 | 
			
		||||
        LogInfo("ARFF/Data", "Added id: " + std::to_string(database.back().id));
 | 
			
		||||
        for (int i = 0; i < attributeList.size(); ++i) {
 | 
			
		||||
            std::getline(parser, token, ',');
 | 
			
		||||
            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
 | 
			
		||||
            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
 | 
			
		||||
            database.back().values.at(i) = token;
 | 
			
		||||
            LogInfo("ARFF/Data", "Added instance value: " + token);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void Arff::TestIntegrity(void) {
 | 
			
		||||
        for (Instance instance : database) {
 | 
			
		||||
            int successCheck = 0;
 | 
			
		||||
            for (int i = 0; i < attributeList.size(); ++i) {
 | 
			
		||||
                LogInfo("ARFF/Integrity", "Instance value tested: '" 
 | 
			
		||||
                        + instance.values.at(i) + "'");
 | 
			
		||||
                for (std::string value : attributeList.at(i).values) {
 | 
			
		||||
                    LogInfo("ARFF/Integrity", "attributeList value: '" 
 | 
			
		||||
                            + value + "'");
 | 
			
		||||
                    if (instance.values.at(i) == value) {
 | 
			
		||||
                        LogInfo("ARFF/Integrity", "Value found: " + value);
 | 
			
		||||
                        ++successCheck;
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            if (successCheck != attributeList.size()) {
 | 
			
		||||
                LogError("ARFF/Integrity", "Value size mismatch: "
 | 
			
		||||
                        + std::to_string(successCheck) + " out of " 
 | 
			
		||||
                        + std::to_string(attributeList.size()));
 | 
			
		||||
                exit(1);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        LogInfo("ARFF/Integrity", "All values exist, continuing...");
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										44
									
								
								src/arff/arff.hpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								src/arff/arff.hpp
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,44 @@
 | 
			
		||||
#ifndef ARFF_HPP
 | 
			
		||||
#define ARFF_HPP
 | 
			
		||||
 | 
			
		||||
#include <string>
 | 
			
		||||
#include <vector>
 | 
			
		||||
 | 
			
		||||
namespace ARFF {
 | 
			
		||||
    // Command-line handling: receives argc/argv as passed to main.
    void ParseArguments(int argc, char* argv[]);

    // Interactive prompt returning the data file name.
    std::string GetDataFilename();

    // Logging helpers: `location` tags the message source, `message` is the text.
    void LogInfo(const std::string location, const std::string message);
    void LogError(const std::string location, const std::string message);
 | 
			
		||||
 | 
			
		||||
    // One declared attribute: its name plus the list of values added to it.
    struct AttributeType {
        // Attribute name.
        std::string attribute;
        // Values registered through AddValue, in insertion order.
        std::vector<std::string> values;

        // Construct an attribute with the given name.
        AttributeType(std::string attribute);

        // Register one more value for this attribute.
        void AddValue(std::string value);
    };
 | 
			
		||||
 | 
			
		||||
    // One data row: a numeric id plus one string value per slot.
    struct Instance {
        // Construct an instance with the given id and `size` value slots.
        Instance(const int id, const int size);

        // Identifier assigned when the instance is created.
        unsigned int id;
        // The row's values, indexed by position.
        std::vector<std::string> values;
    };
 | 
			
		||||
 | 
			
		||||
    class Arff {
 | 
			
		||||
    public:
 | 
			
		||||
        Arff() = default;
 | 
			
		||||
        void Read(std::string filename);
 | 
			
		||||
        void Print(void);
 | 
			
		||||
    private:
 | 
			
		||||
        std::string relation;
 | 
			
		||||
        std::vector<AttributeType> attributeList;
 | 
			
		||||
        std::vector<Instance> database;
 | 
			
		||||
        void AddAttribute(std::string line);
 | 
			
		||||
        void AddData(std::string line);
 | 
			
		||||
        void TestIntegrity(void);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										13
									
								
								src/arff/main.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								src/arff/main.cpp
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,13 @@
 | 
			
		||||
/*
 | 
			
		||||
 * Author: Gregory Crawford
 | 
			
		||||
 * Date: 2024-03-18
 | 
			
		||||
 * Description: Read and store ARFF data from a file
 | 
			
		||||
 */
 | 
			
		||||
#include "arff.hpp"
 | 
			
		||||
 | 
			
		||||
int main(int argc, char* argv[]) {
 | 
			
		||||
    ARFF::ParseArguments(argc, argv);
 | 
			
		||||
    ARFF::Arff data;
 | 
			
		||||
    data.Read(ARFF::GetDataFilename());
 | 
			
		||||
    data.Print();
 | 
			
		||||
}
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user