add oneR to main #2
3
.gitignore
vendored
3
.gitignore
vendored
@ -34,3 +34,6 @@
|
||||
|
||||
# Build directory
|
||||
build
|
||||
|
||||
# Clang files
|
||||
.cache
|
||||
|
57
README.md
57
README.md
@ -1,5 +1,12 @@
|
||||
# arff-mining
|
||||
|
||||
## Author, Date, Description
|
||||
- Gregory Crawford
|
||||
- Date: 2024-04-17
|
||||
- Description: Read data from ARFF file and generate OneR output
|
||||
- Performance summary at the bottom of this README
|
||||
|
||||
|
||||
## Compiling the project
|
||||
|
||||
Prerequisites
|
||||
@ -10,6 +17,7 @@ In order to compile the project, simply run these two commands:
|
||||
cmake -B build -S .
|
||||
cmake --build build
|
||||
|
||||
|
||||
## Running the Project
|
||||
|
||||
The programs should now be compiled at ./build/bin/
|
||||
@ -18,3 +26,52 @@ ARFF:
|
||||
```plain
|
||||
build/bin/arff
|
||||
```
|
||||
|
||||
One Rule:
|
||||
```plain
|
||||
build/bin/onerule
|
||||
```
|
||||
|
||||
|
||||
## Performance Summary
|
||||
|
||||
***contactLenses***
|
||||
```
|
||||
tear-prod-rate:
|
||||
normal ---> soft
|
||||
reduced ---> none
|
||||
Error rate: 7/24
|
||||
build/bin/onerule 0.00s user 0.00s system 85% cpu 0.002 total
|
||||
```
|
||||
|
||||
***restaurants***
|
||||
```
|
||||
Pat:
|
||||
Full ---> No
|
||||
None ---> No
|
||||
Some ---> Yes
|
||||
Error rate: 2/12
|
||||
build/bin/onerule 0.00s user 0.00s system 85% cpu 0.001 total
|
||||
```
|
||||
|
||||
***soybean***
|
||||
```
|
||||
fruit-spots:
|
||||
absent ---> alternarialeaf-spot
|
||||
brown-w/blk-specks ---> anthracnose
|
||||
colored ---> frog-eye-leaf-spot
|
||||
distort ---> alternarialeaf-spot
|
||||
dna ---> brown-stem-rot
|
||||
Error rate: 385/683
|
||||
build/bin/onerule 0.02s user 0.00s system 98% cpu 0.024 total
|
||||
```
|
||||
|
||||
***weather***
|
||||
```
|
||||
outlook:
|
||||
overcast ---> yes
|
||||
rainy ---> yes
|
||||
sunny ---> no
|
||||
Error rate: 4/14
|
||||
build/bin/onerule 0.00s user 0.00s system 87% cpu 0.001 total
|
||||
```
|
||||
|
@ -1 +1,2 @@
|
||||
add_subdirectory(arff)
|
||||
add_subdirectory(onerule)
|
||||
|
@ -1,5 +1,6 @@
|
||||
add_executable(arff
|
||||
./main.cpp
|
||||
./arff.cpp
|
||||
main.cpp
|
||||
arff.cpp
|
||||
log.cpp
|
||||
)
|
||||
|
||||
|
@ -1,18 +1,17 @@
|
||||
#include "arff.hpp"
|
||||
#include "log.hpp"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
|
||||
namespace ARFF {
|
||||
bool isVerbose = false;
|
||||
|
||||
void ParseArguments(int argc, char* argv[]) {
|
||||
std::string argument_string;
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
argument_string.assign(argv[i]);
|
||||
if (argument_string == "-v" || argument_string == "--verbose") {
|
||||
isVerbose = true;
|
||||
debug::verbose = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -22,25 +21,13 @@ namespace ARFF {
|
||||
std::cout << "Please enter name of the data file:\t";
|
||||
std::cin >> filename;
|
||||
if (filename.empty()) {
|
||||
LogError("ARFF/Setup", "No data filename provided, exiting...");
|
||||
debug::Log(kError, "No data filename provided, exiting...");
|
||||
exit(1);
|
||||
}
|
||||
std::cout << std::endl;
|
||||
return filename;
|
||||
}
|
||||
|
||||
void LogInfo(const std::string location, const std::string message) {
|
||||
if (!isVerbose) { return; }
|
||||
std::cout << '[' << location << " - INFO] ";
|
||||
std::cout << message << std::endl;
|
||||
}
|
||||
|
||||
void LogError(const std::string location, const std::string message) {
|
||||
if (!isVerbose) { return; }
|
||||
std::cerr << '[' << location << " - ERROR] ";
|
||||
std::cerr << message << std::endl;
|
||||
}
|
||||
|
||||
AttributeType::AttributeType(std::string attribute) {
|
||||
this->attribute = attribute;
|
||||
}
|
||||
@ -54,11 +41,15 @@ namespace ARFF {
|
||||
this->values.resize(size);
|
||||
}
|
||||
|
||||
// Build an evaluation record bound to the attribute it describes.
// The pointer is stored as-is; the evaluation does not take ownership.
AttributeEvaluation::AttributeEvaluation(AttributeType *attribute)
    : currentAttribute(attribute) {}
|
||||
|
||||
// Read entire data file and parse it
|
||||
void Arff::Read(std::string filename) {
|
||||
std::ifstream dataFile(filename);
|
||||
if (!dataFile.is_open()) {
|
||||
LogError("ARFF/Read", "Unable to open file with name `"
|
||||
debug::Log(kError, "Unable to open file with name `"
|
||||
+ filename + ", exiting...");
|
||||
exit(1);
|
||||
}
|
||||
@ -81,11 +72,16 @@ namespace ARFF {
|
||||
TestIntegrity();
|
||||
}
|
||||
|
||||
void Arff::Print(void) {
|
||||
// Print generic data information
|
||||
// Number of attributes and size of database
|
||||
void Arff::PrintOverview(void) {
|
||||
std::cout << attributeList.size() << " attributes\n";
|
||||
std::cout << database.size() << " examples\n";
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Print full data information
|
||||
void Arff::PrintData(void) {
|
||||
std::cout << "Attribute (#): values\n";
|
||||
for (AttributeType type : attributeList) {
|
||||
std::cout << type.attribute << " (" << type.values.size() << "):";
|
||||
@ -107,6 +103,19 @@ namespace ARFF {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Print result of applying OneR
|
||||
// TODO: Create function
|
||||
void Arff::OneR(void) {
|
||||
AttributeEvaluation bestAttribute = _OneR();
|
||||
debug::Log(kNone, "***Best 1-rule***");
|
||||
debug::Log(kNone, "\t" + bestAttribute.currentAttribute->attribute + ':');
|
||||
for (auto it : bestAttribute.rules) {
|
||||
debug::Log(kNone, "\t\t" + it.first + " ---> " + it.second);
|
||||
}
|
||||
debug::Log(kNone, "Error rate: " + std::to_string(bestAttribute.totalError) + "/" + std::to_string(database.size()));
|
||||
}
|
||||
|
||||
|
||||
// Add the attribute to the list
|
||||
void Arff::AddAttribute(std::string line) {
|
||||
std::stringstream parser(line);
|
||||
@ -120,12 +129,12 @@ namespace ARFF {
|
||||
if (token == "@relation" || token == "@RELATION") {
|
||||
parser >> token;
|
||||
relation = token;
|
||||
LogInfo("ARFF/Attribute", "Relation set: " + relation);
|
||||
debug::Log(kLog, "Relation set: " + relation);
|
||||
return;
|
||||
}
|
||||
parser >> token;
|
||||
attributeList.emplace_back(token);
|
||||
LogInfo("ARFF/Attribute", "Added attribute: " + token);
|
||||
debug::Log(kLog, "Added attribute: " + token);
|
||||
while (std::getline(parser, token, ',')) {
|
||||
// Clean token from outside pieces
|
||||
token.erase(std::remove(token.begin(), token.end(), ' '), token.end());
|
||||
@ -136,7 +145,7 @@ namespace ARFF {
|
||||
token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
|
||||
token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
|
||||
attributeList.back().AddValue(token);
|
||||
LogInfo("ARFF/Attribute", "Added value: " + token);
|
||||
debug::Log(kLog, "Added value: " + token);
|
||||
}
|
||||
// Additional missing value case
|
||||
attributeList.back().AddValue("?");
|
||||
@ -149,14 +158,14 @@ namespace ARFF {
|
||||
int id = 0;
|
||||
if (!database.empty()) { id = database.back().id + 1; }
|
||||
database.emplace_back(id, attributeList.size());
|
||||
LogInfo("ARFF/Data", "Added id: " + std::to_string(database.back().id));
|
||||
debug::Log(kLog, "Added id: " + std::to_string(database.back().id));
|
||||
for (int i = 0; i < attributeList.size(); ++i) {
|
||||
std::getline(parser, token, ',');
|
||||
token.erase(std::remove(token.begin(), token.end(), ' '), token.end());
|
||||
token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
|
||||
token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
|
||||
database.back().values.at(i) = token;
|
||||
LogInfo("ARFF/Data", "Added instance value: " + token);
|
||||
debug::Log(kLog, "Added instance value: " + token);
|
||||
}
|
||||
}
|
||||
|
||||
@ -164,25 +173,81 @@ namespace ARFF {
|
||||
for (Instance instance : database) {
|
||||
int successCheck = 0;
|
||||
for (int i = 0; i < attributeList.size(); ++i) {
|
||||
LogInfo("ARFF/Integrity", "Instance value tested: '"
|
||||
debug::Log(kTrace, "Instance value tested: '"
|
||||
+ instance.values.at(i) + "'");
|
||||
for (std::string value : attributeList.at(i).values) {
|
||||
LogInfo("ARFF/Integrity", "attributeList value: '"
|
||||
debug::Log(kTrace, "attributeList value: '"
|
||||
+ value + "'");
|
||||
if (instance.values.at(i) == value) {
|
||||
LogInfo("ARFF/Integrity", "Value found: " + value);
|
||||
debug::Log(kTrace, "Value found: " + value);
|
||||
++successCheck;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (successCheck != attributeList.size()) {
|
||||
LogError("ARFF/Integrity", "Value size mismatch: "
|
||||
debug::Log(kError, "Value size mismatch: "
|
||||
+ std::to_string(successCheck) + " out of "
|
||||
+ std::to_string(attributeList.size()));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
LogInfo("ARFF/Integrity", "All values exist, continuing...");
|
||||
debug::Log(kLog, "All values exist, continuing...");
|
||||
}
|
||||
|
||||
// Perform OneR on data that was previously read in
|
||||
AttributeEvaluation Arff::_OneR(void) {
|
||||
AttributeEvaluation bestEvaluation;
|
||||
bestEvaluation.totalErrorRate = 1.0f;
|
||||
// -1 used for ignoring test rule (eg, play=yes/no)
|
||||
for (int i = 0; i < attributeList.size() - 1; ++i) {
|
||||
AttributeEvaluation evaluation = EvaluateAttribute(&attributeList[i], i);
|
||||
if (evaluation.totalErrorRate < bestEvaluation.totalErrorRate) {
|
||||
bestEvaluation = evaluation;
|
||||
bestEvaluation.currentAttribute = evaluation.currentAttribute;
|
||||
}
|
||||
debug::Log(kLog, "Evaluation on " + evaluation.currentAttribute->attribute + " completed");
|
||||
}
|
||||
return bestEvaluation;
|
||||
}
|
||||
|
||||
// Determine error rate and best option for each value of an attribute
|
||||
// Originally set up to use OneR
|
||||
AttributeEvaluation Arff::EvaluateAttribute(AttributeType *attribute, const int attributePos) {
|
||||
AttributeEvaluation evaluation(attribute);
|
||||
std::map<std::string, int> results;
|
||||
for (std::string value : attributeList.end()->values) {
|
||||
results.emplace(value, 0);
|
||||
}
|
||||
for (int i = 0; i < attribute->values.size(); ++i) {
|
||||
int resultsTotal = 0;
|
||||
if (attribute->values[i] == "?") { continue; }
|
||||
for (auto instance = database.begin(); instance != database.end(); ++instance) {
|
||||
if (instance->values[attributePos] != attribute->values[i]) { continue; }
|
||||
++results[instance->values.back()];
|
||||
resultsTotal++;
|
||||
}
|
||||
debug::Log(kTrace, "Results:");
|
||||
for (auto it : results) { debug::Log(kTrace, "\t" + it.first + ": " + std::to_string(it.second)); }
|
||||
float lowest;
|
||||
float lowestRate = 1.0f;
|
||||
std::string bestResult = results.begin()->first;
|
||||
for (auto it = results.begin(); it != results.end(); ++it) {
|
||||
if (((resultsTotal - it->second) / float(resultsTotal)) < lowestRate) {
|
||||
lowestRate = ((resultsTotal - it->second) / float(resultsTotal));
|
||||
lowest = resultsTotal - it->second;
|
||||
bestResult = it->first;
|
||||
}
|
||||
}
|
||||
evaluation.rules.emplace(attribute->values[i], bestResult);
|
||||
evaluation.totalError += lowest;
|
||||
debug::Log(kLog, "Added rule " + attribute->values[i] + "->" + bestResult);
|
||||
// Reset
|
||||
for (auto it = results.begin(); it != results.end(); ++it) {
|
||||
it->second = 0;
|
||||
}
|
||||
}
|
||||
evaluation.totalErrorRate = evaluation.totalError / float(database.size());
|
||||
return evaluation;
|
||||
}
|
||||
}
|
||||
|
@ -3,12 +3,11 @@
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
namespace ARFF {
|
||||
void ParseArguments(int argc, char* argv[]);
|
||||
std::string GetDataFilename(void);
|
||||
void LogInfo(const std::string location, const std::string message);
|
||||
void LogError(const std::string location, const std::string message);
|
||||
|
||||
struct AttributeType {
|
||||
public:
|
||||
@ -25,11 +24,22 @@ namespace ARFF {
|
||||
std::vector<std::string> values;
|
||||
};
|
||||
|
||||
struct AttributeEvaluation {
|
||||
AttributeEvaluation() = default;
|
||||
AttributeEvaluation(AttributeType *attribute);
|
||||
AttributeType *currentAttribute;
|
||||
std::map<std::string, std::string> rules;
|
||||
float totalErrorRate = 0.0f;
|
||||
int totalError = 0;
|
||||
};
|
||||
|
||||
class Arff {
|
||||
public:
|
||||
Arff() = default;
|
||||
void Read(std::string filename);
|
||||
void Print(void);
|
||||
void PrintOverview(void);
|
||||
void PrintData(void);
|
||||
void OneR(void);
|
||||
private:
|
||||
std::string relation;
|
||||
std::vector<AttributeType> attributeList;
|
||||
@ -37,6 +47,8 @@ namespace ARFF {
|
||||
void AddAttribute(std::string line);
|
||||
void AddData(std::string line);
|
||||
void TestIntegrity(void);
|
||||
AttributeEvaluation _OneR(void);
|
||||
AttributeEvaluation EvaluateAttribute(AttributeType *attribute, const int attributePos);
|
||||
};
|
||||
|
||||
}
|
||||
|
19
src/arff/log.cpp
Normal file
19
src/arff/log.cpp
Normal file
@ -0,0 +1,19 @@
|
||||
#include "log.hpp"
|
||||
#include <iostream>
|
||||
|
||||
namespace debug {
|
||||
bool verbose = false;
|
||||
|
||||
void Log(LogLevel level, std::string message) {
|
||||
std::string logMessage = "";
|
||||
if (!verbose && level > kNone) { return; }
|
||||
switch (level) {
|
||||
case kLog: logMessage += "[LOG] "; break;
|
||||
case kWarn: logMessage += "[WARN] "; break;
|
||||
case kError: logMessage += "[ERROR] "; break;
|
||||
case kTrace: logMessage += "[TRACE] "; break;
|
||||
}
|
||||
logMessage += message;
|
||||
std::cout << logMessage << std::endl;
|
||||
}
|
||||
}
|
19
src/arff/log.hpp
Normal file
19
src/arff/log.hpp
Normal file
@ -0,0 +1,19 @@
|
||||
#ifndef LOG_HPP
|
||||
#define LOG_HPP
|
||||
|
||||
#include <string>
|
||||
|
||||
// Severity tags understood by debug::Log.
// kNone carries no tag; the remaining levels are printed with a
// "[LOG] ", "[WARN] ", "[ERROR] " or "[TRACE] " prefix respectively.
enum LogLevel {
    kNone  = -1,
    kLog   = 0,
    kWarn  = 1,
    kError = 2,
    kTrace = 3
};

namespace debug {
    // Verbosity switch consulted by Log; defined in log.cpp.
    extern bool verbose;

    // Print `message` tagged according to `level`.
    void Log(LogLevel level, std::string message);
}
|
||||
|
||||
#endif
|
@ -9,5 +9,6 @@ int main(int argc, char* argv[]) {
|
||||
ARFF::ParseArguments(argc, argv);
|
||||
ARFF::Arff data;
|
||||
data.Read(ARFF::GetDataFilename());
|
||||
data.Print();
|
||||
data.PrintOverview();
|
||||
data.PrintData();
|
||||
}
|
||||
|
7
src/onerule/CMakeLists.txt
Normal file
7
src/onerule/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
||||
add_executable(onerule
    main.cpp
    ../arff/arff.cpp
    ../arff/log.cpp
)

# Expose the arff headers to this target only. Unlike include_directories(),
# this does not leak the path to other targets defined in this directory, and
# the absolute path does not depend on how the relative one is resolved.
target_include_directories(onerule PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../arff)
|
||||
|
14
src/onerule/main.cpp
Normal file
14
src/onerule/main.cpp
Normal file
@ -0,0 +1,14 @@
|
||||
/*
|
||||
* Author: Gregory Crawford
|
||||
* Date: 2024-03-18
|
||||
* Description: Read data from ARFF file and generate OneR output
|
||||
*/
|
||||
#include "arff.hpp"
|
||||
|
||||
// Entry point of the onerule tool: parse the command-line flags, ask for the
// data file, load it, then print the overview followed by the best 1-rule.
int main(int argc, char* argv[]) {
    ARFF::ParseArguments(argc, argv);

    ARFF::Arff data;
    const std::string filename = ARFF::GetDataFilename();
    data.Read(filename);

    data.PrintOverview();
    data.OneR();
    return 0;
}
|
Loading…
Reference in New Issue
Block a user