add oneR to main #2

Merged
Trianta merged 8 commits from oneR into main 2024-04-17 21:35:56 -05:00
11 changed files with 233 additions and 34 deletions

3
.gitignore vendored
View File

@ -34,3 +34,6 @@
# Build directory # Build directory
build build
# Clang files
.cache

View File

@ -1,5 +1,12 @@
# arff-mining # arff-mining
## Author, Date, Description
- Gregory Crawford
- Date: 2024-04-17
- Description: Read data from ARFF file and generate OneR output
- Performance summary at the bottom of this README
## Compiling the project ## Compiling the project
Prerequisites Prerequisites
@ -10,6 +17,7 @@ In order to compile the project, simply run these two commands:
cmake -B build -S . cmake -B build -S .
cmake --build build cmake --build build
## Running the Project ## Running the Project
The programs should now be compiled at ./build/bin/ The programs should now be compiled at ./build/bin/
@ -18,3 +26,52 @@ ARFF:
```plain ```plain
build/bin/arff build/bin/arff
``` ```
One Rule:
```plain
build/bin/onerule
```
## Performance Summary
***contactLenses***
```
tear-prod-rate:
normal ---> soft
reduced ---> none
Error rate: 7/24
build/bin/onerule 0.00s user 0.00s system 85% cpu 0.002 total
```
***restaurants***
```
Pat:
Full ---> No
None ---> No
Some ---> Yes
Error rate: 2/12
build/bin/onerule 0.00s user 0.00s system 85% cpu 0.001 total
```
***soybean***
```
fruit-spots:
absent ---> alternarialeaf-spot
brown-w/blk-specks ---> anthracnose
colored ---> frog-eye-leaf-spot
distort ---> alternarialeaf-spot
dna ---> brown-stem-rot
Error rate: 385/683
build/bin/onerule 0.02s user 0.00s system 98% cpu 0.024 total
```
***weather***
```
outlook:
overcast ---> yes
rainy ---> yes
sunny ---> no
Error rate: 4/14
build/bin/onerule 0.00s user 0.00s system 87% cpu 0.001 total
```

View File

@ -1 +1,2 @@
add_subdirectory(arff) add_subdirectory(arff)
add_subdirectory(onerule)

View File

@ -1,5 +1,6 @@
add_executable(arff add_executable(arff
./main.cpp main.cpp
./arff.cpp arff.cpp
log.cpp
) )

View File

@ -1,18 +1,17 @@
#include "arff.hpp" #include "arff.hpp"
#include "log.hpp"
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <sstream> #include <sstream>
#include <algorithm> #include <algorithm>
namespace ARFF { namespace ARFF {
bool isVerbose = false;
void ParseArguments(int argc, char* argv[]) { void ParseArguments(int argc, char* argv[]) {
std::string argument_string; std::string argument_string;
for (int i = 0; i < argc; ++i) { for (int i = 0; i < argc; ++i) {
argument_string.assign(argv[i]); argument_string.assign(argv[i]);
if (argument_string == "-v" || argument_string == "--verbose") { if (argument_string == "-v" || argument_string == "--verbose") {
isVerbose = true; debug::verbose = true;
} }
} }
} }
@ -22,25 +21,13 @@ namespace ARFF {
std::cout << "Please enter name of the data file:\t"; std::cout << "Please enter name of the data file:\t";
std::cin >> filename; std::cin >> filename;
if (filename.empty()) { if (filename.empty()) {
LogError("ARFF/Setup", "No data filename provided, exiting..."); debug::Log(kError, "No data filename provided, exiting...");
exit(1); exit(1);
} }
std::cout << std::endl; std::cout << std::endl;
return filename; return filename;
} }
void LogInfo(const std::string location, const std::string message) {
if (!isVerbose) { return; }
std::cout << '[' << location << " - INFO] ";
std::cout << message << std::endl;
}
void LogError(const std::string location, const std::string message) {
if (!isVerbose) { return; }
std::cerr << '[' << location << " - ERROR] ";
std::cerr << message << std::endl;
}
AttributeType::AttributeType(std::string attribute) { AttributeType::AttributeType(std::string attribute) {
this->attribute = attribute; this->attribute = attribute;
} }
@ -54,11 +41,15 @@ namespace ARFF {
this->values.resize(size); this->values.resize(size);
} }
// Create an evaluation bound to the attribute that is about to be scored.
// The rule map and error counters keep their in-class default values.
AttributeEvaluation::AttributeEvaluation(AttributeType *attribute)
    : currentAttribute(attribute) {}
// Read entire data file and parse it // Read entire data file and parse it
void Arff::Read(std::string filename) { void Arff::Read(std::string filename) {
std::ifstream dataFile(filename); std::ifstream dataFile(filename);
if (!dataFile.is_open()) { if (!dataFile.is_open()) {
LogError("ARFF/Read", "Unable to open file with name `" debug::Log(kError, "Unable to open file with name `"
+ filename + ", exiting..."); + filename + ", exiting...");
exit(1); exit(1);
} }
@ -81,11 +72,16 @@ namespace ARFF {
TestIntegrity(); TestIntegrity();
} }
void Arff::Print(void) { // Print generic data information
// Number of attributes and size of database
void Arff::PrintOverview(void) {
std::cout << attributeList.size() << " attributes\n"; std::cout << attributeList.size() << " attributes\n";
std::cout << database.size() << " examples\n"; std::cout << database.size() << " examples\n";
std::cout << std::endl; std::cout << std::endl;
}
// Print full data information
void Arff::PrintData(void) {
std::cout << "Attribute (#): values\n"; std::cout << "Attribute (#): values\n";
for (AttributeType type : attributeList) { for (AttributeType type : attributeList) {
std::cout << type.attribute << " (" << type.values.size() << "):"; std::cout << type.attribute << " (" << type.values.size() << "):";
@ -107,6 +103,19 @@ namespace ARFF {
std::cout << std::endl; std::cout << std::endl;
} }
// Print result of applying OneR
// TODO: Create function
void Arff::OneR(void) {
AttributeEvaluation bestAttribute = _OneR();
debug::Log(kNone, "***Best 1-rule***");
debug::Log(kNone, "\t" + bestAttribute.currentAttribute->attribute + ':');
for (auto it : bestAttribute.rules) {
debug::Log(kNone, "\t\t" + it.first + " ---> " + it.second);
}
debug::Log(kNone, "Error rate: " + std::to_string(bestAttribute.totalError) + "/" + std::to_string(database.size()));
}
// Add the attribute to the list // Add the attribute to the list
void Arff::AddAttribute(std::string line) { void Arff::AddAttribute(std::string line) {
std::stringstream parser(line); std::stringstream parser(line);
@ -120,12 +129,12 @@ namespace ARFF {
if (token == "@relation" || token == "@RELATION") { if (token == "@relation" || token == "@RELATION") {
parser >> token; parser >> token;
relation = token; relation = token;
LogInfo("ARFF/Attribute", "Relation set: " + relation); debug::Log(kLog, "Relation set: " + relation);
return; return;
} }
parser >> token; parser >> token;
attributeList.emplace_back(token); attributeList.emplace_back(token);
LogInfo("ARFF/Attribute", "Added attribute: " + token); debug::Log(kLog, "Added attribute: " + token);
while (std::getline(parser, token, ',')) { while (std::getline(parser, token, ',')) {
// Clean token from outside pieces // Clean token from outside pieces
token.erase(std::remove(token.begin(), token.end(), ' '), token.end()); token.erase(std::remove(token.begin(), token.end(), ' '), token.end());
@ -136,7 +145,7 @@ namespace ARFF {
token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
attributeList.back().AddValue(token); attributeList.back().AddValue(token);
LogInfo("ARFF/Attribute", "Added value: " + token); debug::Log(kLog, "Added value: " + token);
} }
// Additional missing value case // Additional missing value case
attributeList.back().AddValue("?"); attributeList.back().AddValue("?");
@ -149,14 +158,14 @@ namespace ARFF {
int id = 0; int id = 0;
if (!database.empty()) { id = database.back().id + 1; } if (!database.empty()) { id = database.back().id + 1; }
database.emplace_back(id, attributeList.size()); database.emplace_back(id, attributeList.size());
LogInfo("ARFF/Data", "Added id: " + std::to_string(database.back().id)); debug::Log(kLog, "Added id: " + std::to_string(database.back().id));
for (int i = 0; i < attributeList.size(); ++i) { for (int i = 0; i < attributeList.size(); ++i) {
std::getline(parser, token, ','); std::getline(parser, token, ',');
token.erase(std::remove(token.begin(), token.end(), ' '), token.end()); token.erase(std::remove(token.begin(), token.end(), ' '), token.end());
token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
database.back().values.at(i) = token; database.back().values.at(i) = token;
LogInfo("ARFF/Data", "Added instance value: " + token); debug::Log(kLog, "Added instance value: " + token);
} }
} }
@ -164,25 +173,81 @@ namespace ARFF {
for (Instance instance : database) { for (Instance instance : database) {
int successCheck = 0; int successCheck = 0;
for (int i = 0; i < attributeList.size(); ++i) { for (int i = 0; i < attributeList.size(); ++i) {
LogInfo("ARFF/Integrity", "Instance value tested: '" debug::Log(kTrace, "Instance value tested: '"
+ instance.values.at(i) + "'"); + instance.values.at(i) + "'");
for (std::string value : attributeList.at(i).values) { for (std::string value : attributeList.at(i).values) {
LogInfo("ARFF/Integrity", "attributeList value: '" debug::Log(kTrace, "attributeList value: '"
+ value + "'"); + value + "'");
if (instance.values.at(i) == value) { if (instance.values.at(i) == value) {
LogInfo("ARFF/Integrity", "Value found: " + value); debug::Log(kTrace, "Value found: " + value);
++successCheck; ++successCheck;
break; break;
} }
} }
} }
if (successCheck != attributeList.size()) { if (successCheck != attributeList.size()) {
LogError("ARFF/Integrity", "Value size mismatch: " debug::Log(kError, "Value size mismatch: "
+ std::to_string(successCheck) + " out of " + std::to_string(successCheck) + " out of "
+ std::to_string(attributeList.size())); + std::to_string(attributeList.size()));
exit(1); exit(1);
} }
} }
LogInfo("ARFF/Integrity", "All values exist, continuing..."); debug::Log(kLog, "All values exist, continuing...");
}
// Perform OneR on data that was previously read in.
// Evaluates every attribute except the last one and returns the evaluation
// with the lowest total error rate; on a tie the earliest attribute wins.
// When there is no attribute to evaluate, the returned evaluation has a null
// currentAttribute and an error rate of 1.0.
AttributeEvaluation Arff::_OneR(void) {
    AttributeEvaluation bestEvaluation;
    bestEvaluation.currentAttribute = nullptr;
    bestEvaluation.totalErrorRate = 1.0f;

    // The last attribute is the test rule (eg, play=yes/no) and is not a
    // candidate. Computing the bound as a signed value keeps an empty
    // attribute list from wrapping around to a huge unsigned count.
    const int candidateCount = static_cast<int>(attributeList.size()) - 1;
    for (int i = 0; i < candidateCount; ++i) {
        const AttributeEvaluation evaluation = EvaluateAttribute(&attributeList[i], i);
        // Always accept the first candidate so a valid attribute is returned
        // even if no rate compares below the 1.0 starting value.
        if (bestEvaluation.currentAttribute == nullptr
            || evaluation.totalErrorRate < bestEvaluation.totalErrorRate) {
            bestEvaluation = evaluation;
        }
        debug::Log(kLog, "Evaluation on " + evaluation.currentAttribute->attribute + " completed");
    }
    return bestEvaluation;
}
// Determine error rate and best option for each value of an attribute.
// For every value of `attribute` (the "?" missing-value marker is skipped),
// counts the class labels of the instances carrying that value, adds a rule
// mapping the value to the most frequent label, and adds the instances that
// rule misclassifies to the total error. On a tie the label that comes first
// in map order wins.
//   attribute    - attribute to evaluate
//   attributePos - index of that attribute within each instance's values
// Returns the evaluation holding the rules, the total error count and the
// total error rate (total error divided by the number of instances).
AttributeEvaluation Arff::EvaluateAttribute(AttributeType *attribute, const int attributePos) {
    AttributeEvaluation evaluation(attribute);

    // Tally of class labels, seeded from the last attribute's value list.
    // back() is used because end() is the past-the-end iterator and must not
    // be dereferenced.
    std::map<std::string, int> results;
    for (const std::string& label : attributeList.back().values) {
        results.emplace(label, 0);
    }

    for (const std::string& value : attribute->values) {
        if (value == "?") { continue; }

        int resultsTotal = 0;
        for (const auto& instance : database) {
            if (instance.values[attributePos] != value) { continue; }
            ++results[instance.values.back()];
            ++resultsTotal;
        }
        debug::Log(kTrace, "Results:");
        for (const auto& entry : results) {
            debug::Log(kTrace, "\t" + entry.first + ": " + std::to_string(entry.second));
        }

        // No instance carries this value: there is nothing to base a rule on
        // and no error to count. The tally is still all zeros, so no reset is
        // needed before moving on.
        if (resultsTotal == 0) { continue; }

        // Every label shares the same denominator, so the lowest error count
        // is also the lowest error rate; integers avoid float round-off.
        int fewestErrors = resultsTotal;
        std::string bestResult = results.begin()->first;
        for (const auto& entry : results) {
            const int errors = resultsTotal - entry.second;
            if (errors < fewestErrors) {
                fewestErrors = errors;
                bestResult = entry.first;
            }
        }
        evaluation.rules.emplace(value, bestResult);
        evaluation.totalError += fewestErrors;
        debug::Log(kLog, "Added rule " + value + "->" + bestResult);

        // Reset the tally for the next value.
        for (auto& entry : results) {
            entry.second = 0;
        }
    }

    // Guard the division so an empty database yields 0 rather than NaN.
    evaluation.totalErrorRate = database.empty()
        ? 0.0f
        : evaluation.totalError / static_cast<float>(database.size());
    return evaluation;
}
} }

View File

@ -3,12 +3,11 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <map>
namespace ARFF { namespace ARFF {
void ParseArguments(int argc, char* argv[]); void ParseArguments(int argc, char* argv[]);
std::string GetDataFilename(void); std::string GetDataFilename(void);
void LogInfo(const std::string location, const std::string message);
void LogError(const std::string location, const std::string message);
struct AttributeType { struct AttributeType {
public: public:
@ -25,11 +24,22 @@ namespace ARFF {
std::vector<std::string> values; std::vector<std::string> values;
}; };
// Outcome of scoring a single attribute with OneR.
struct AttributeEvaluation {
    AttributeEvaluation() = default;
    AttributeEvaluation(AttributeType *attribute);
    // Attribute the rules below belong to. Initialized to null so that a
    // default-constructed evaluation never holds an indeterminate pointer.
    AttributeType *currentAttribute = nullptr;
    // Maps each attribute value to the class label chosen for it.
    std::map<std::string, std::string> rules;
    // totalError divided by the number of examples in the database.
    float totalErrorRate = 0.0f;
    // Number of examples the rules misclassify, summed over all values.
    int totalError = 0;
};
class Arff { class Arff {
public: public:
Arff() = default; Arff() = default;
void Read(std::string filename); void Read(std::string filename);
void Print(void); void PrintOverview(void);
void PrintData(void);
void OneR(void);
private: private:
std::string relation; std::string relation;
std::vector<AttributeType> attributeList; std::vector<AttributeType> attributeList;
@ -37,6 +47,8 @@ namespace ARFF {
void AddAttribute(std::string line); void AddAttribute(std::string line);
void AddData(std::string line); void AddData(std::string line);
void TestIntegrity(void); void TestIntegrity(void);
AttributeEvaluation _OneR(void);
AttributeEvaluation EvaluateAttribute(AttributeType *attribute, const int attributePos);
}; };
} }

19
src/arff/log.cpp Normal file
View File

@ -0,0 +1,19 @@
#include "log.hpp"
#include <iostream>
namespace debug {
    // When false, only kNone messages are printed.
    bool verbose = false;

    // Print `message` on its own line, prefixed with a tag for `level`.
    //   level   - kNone messages are always printed and carry no tag; every
    //             other level is printed only while `verbose` is true
    //   message - text to print
    // kError messages go to std::cerr, as the LogError helper this function
    // replaces did; all other levels go to std::cout.
    void Log(LogLevel level, std::string message) {
        // Filter first so suppressed messages cost no string building.
        if (!verbose && level > kNone) { return; }

        std::string logMessage;
        switch (level) {
            case kNone:  break;  // plain program output, no tag
            case kLog:   logMessage += "[LOG] ";   break;
            case kWarn:  logMessage += "[WARN] ";  break;
            case kError: logMessage += "[ERROR] "; break;
            case kTrace: logMessage += "[TRACE] "; break;
        }
        logMessage += message;

        std::ostream& out = (level == kError) ? std::cerr : std::cout;
        // std::endl is kept so the flush after every message is preserved.
        out << logMessage << std::endl;
    }
}

19
src/arff/log.hpp Normal file
View File

@ -0,0 +1,19 @@
#ifndef LOG_HPP
#define LOG_HPP
#include <string>
// Message categories accepted by debug::Log.
// The numeric order is not a severity ranking; debug::Log only distinguishes
// kNone (below zero) from everything else when deciding whether to print.
enum LogLevel {
// Plain program output: printed even when verbose is off, with no tag.
kNone = -1,
// General progress message, tagged "[LOG] "; printed only in verbose mode.
kLog = 0,
// Warning, tagged "[WARN] "; printed only in verbose mode.
kWarn,
// Error, tagged "[ERROR] "; printed only in verbose mode.
kError,
// Fine-grained tracing, tagged "[TRACE] "; printed only in verbose mode.
kTrace
};
// Logging facilities shared by the programs in this project.
namespace debug {
// Verbosity switch, defined in log.cpp; ARFF::ParseArguments sets it to true
// when -v or --verbose is passed.
extern bool verbose;
// Print `message` tagged for `level`; levels other than kNone are printed
// only while `verbose` is true.
void Log(LogLevel level, std::string message);
}
#endif

View File

@ -9,5 +9,6 @@ int main(int argc, char* argv[]) {
ARFF::ParseArguments(argc, argv); ARFF::ParseArguments(argc, argv);
ARFF::Arff data; ARFF::Arff data;
data.Read(ARFF::GetDataFilename()); data.Read(ARFF::GetDataFilename());
data.Print(); data.PrintOverview();
data.PrintData();
} }

View File

@ -0,0 +1,7 @@
include_directories(../arff)
add_executable(onerule
main.cpp
../arff/arff.cpp
../arff/log.cpp
)

14
src/onerule/main.cpp Normal file
View File

@ -0,0 +1,14 @@
/*
* Author: Gregory Crawford
* Date: 2024-03-18
* Description: Read data from ARFF file and generate OneR output
*/
#include "arff.hpp"
int main(int argc, char* argv[]) {
ARFF::ParseArguments(argc, argv);
ARFF::Arff data;
data.Read(ARFF::GetDataFilename());
data.PrintOverview();
data.OneR();
}