/* -------------------------------------------------------------------------- */
/* */
/* CLASSIFICATION */
/* */
/* Frans Coenen */
/* */
/* Wednesday 4 February 2004 */
/* */
/* Department of Computer Science */
/* The University of Liverpool */
/* */
/* -------------------------------------------------------------------------- */
/* Class structure
AssocRuleMining
|
+-- Classification */
// Java packages
import java.util.*;
import java.io.*;
/** Methods to produce classification rules using FOIL style algorithms.
Assumes that input dataset is organised such that classes are at the end of
each record. Note: number of classes value is stored in the numClasses
field.
@author Frans Coenen
@version 29 April 2003 */
public class Classification extends AssocRuleMining {
/* ------ FIELDS ------ */
// Data structures
/** 2-D array to hold the test data set.
Classification involves
producing a set of Classification Rules (CRs) from a training set and then
testing the effectiveness of the CRs on a test set. */
protected short[][] testDataArray = null;
/** 3-D array to hold 10th sets of input data.
Used in conjunction with
"10 Cross Validation" where the input data is divided into 10 subsets and
CRs are produced using each subset in turn and validated against the
remaining 9 sets. The overall average accuracy is then the total accuracy
divided by 10. */
protected short[][][] tenthDataSets = new short[10][][];
/** 1-D data array to hold classifiers (can not call it classes because
this is a reserved word!). */
protected short[] classifiers = null;
/** 2-D attribute array, first row contains '0.0' or '1.0' (0.0=available,
1.0=unavailable), second row contains gain.
Used in FOIL, PRM and
CPAR. */
protected double[][] attributes = null;
/** Temporary 2-D attribute array.
Used in FOIL, PRM and CPAR. */
protected double[][] attributes2 = null;
// Constants
/** The maximum number of rules to be considered per class when classifying
records during testing. Usually set to 5. */
protected final int K_VALUE = 5;
/** The minimum best gain acceptable for the process of the generation of a
single rule to continue.
Used in FOIL, PRM and CPAR.*/
protected final double MIN_BEST_GAIN=0.7;
// Other fields
/** Number of rows in input data set.
Note this is not the same as the
number of rows in the classification training set. Used for temporary
storage of total number of rows when using Ten Cross Validation (TCV)
approach only.
The numRows field inherited from the super class
records is used throughout the CR generation process. Set to number of rows
using setNumRowsInInputSet method called by application class. */
protected int numRowsInInputSet;
/** Number of rows in test set, not the same as the number of rows
in the classification training set. */
protected int numRowsInTestSet;
/** Number of rows in training set, not the same as the number of rows
in the classification test set. */
protected int numRowsInTrainingSet;
/** Instance of the class RuleList */
protected RuleList currentRlist = null;
/* ------ CONSTRUCTORS ------ */
/** Constructor for Classification class which also processes command
line arguments.
@param args the command line arguments (array of String instances). */
public Classification(String[] args) {
super(args);
// Create RuleList object for later usage
currentRlist = new RuleList();
}
/* ------ METHODS ------ */
/** Starts classification rule generation process.
@return The classification accuracy (%). */
public double startClassification() {
/* STUB */
return(0.0); // Default
}
/* ------------------------------------------------------------ */
/* */
/* GAIN CALCULATIONS */
/* */
/* ------------------------------------------------------------ */
/* FOIL, PRM and CPAR all use the concept of "gain" to judge whether the
inclusion of an attribute within a rule "buys" anything. How gain values are
calculated and used depends on the algorithm bit methods included here are
general methods used by all three algorithms. */
/* GET BEST GAIN */
/** Process attributes array to identify the best gain
@return the index of the best attribute, if no available attributes or
will return 0. */
protected int getBestGain() {
int indexOfBestAttribute = 0;
double bestGainSofar = 0.0;
// Loop through attributes2 array
for (int index=1;indexbestGainSofar) {
indexOfBestAttribute = index;
bestGainSofar = attributes2[index][1];
}
}
}
// Return
return(indexOfBestAttribute);
}
/* ---------------------------------------------------------------- */
/* */
/* TEST CLASSIFICATION */
/* */
/* ---------------------------------------------------------------- */
/* TEST CLASSIFICATION */
/** Tests the generated classification rules using test sets and return
percentage accuracy.
@param the percentage accuracy. */
protected double testClassification() {
int correctClassCounter = 0;
int wrongClassCounter = 0;
int unclassifiedCounter = 0;
// Check if test data exists, if not return' 0'
if (testDataArray==null) {
System.out.println("ERROR: No test data");
return(0);
}
// Check if any classification rules have been generated, if not
// return'0'
if (currentRlist.startRulelist==null) {
System.out.println("No classification rules generated!");
return(0);
}
// Loop through test set
int index=0;
for(;index Note: (1) training data set is stored in the dataArray
structure in which the input data was originally stored, (2) method called
from application class not from Classification class constructor
because the input data set may (given a particular application) first
require ordering and possibly also pruning and recasting (see
recastClassifiers method). */
public void createTrainingAndTestDataSets() {
// Determine size of training and test sets.
final double PERCENTAGE_SIZE_OF_TEST_SET = 50.0;
numRowsInTestSet = (int) ((double) numRows*
PERCENTAGE_SIZE_OF_TEST_SET/100.0);
numRowsInTrainingSet = numRows-numRowsInTestSet;
numRows = numRowsInTrainingSet;
// Output
System.out.println("Num. of recs. in training set = " +
numRowsInTrainingSet + "\nNum. of recs. " +
"in test set = " + numRowsInTestSet);
// Dimension and populate training set.
short[][] trainingSet = new short[numRowsInTrainingSet][];
int index1=0;
for (;index1 Note: (1) useing TCV (Ten Cross Validation), (2) training
data set is stored in the dataArray structure in which the input data was
originally stored, in which the initial input data is stored, (3)
this method is not called from the constructor as the input data set may
(given a particular application) first require ordering and possibly also
pruning.
@param testSetIndex the index of the tenths data sets to be used as the
test set. */
private void createTrainingAndTestDataSets(int testSetIndex) {
// Dimension and populate test set.
numRowsInTestSet = tenthDataSets[testSetIndex].length;
testDataArray = tenthDataSets[testSetIndex];
// Dimension of and populate training set.
numRowsInTrainingSet = numRowsInInputSet-numRowsInTestSet;
numRows = numRowsInTrainingSet;
short[][] trainingSet = new short[numRows][];
int trainingSetIndex=0;
// Before test set
for(int index=0;index It is possible,
using PRM or CPAR, to generate two or more rules with identical antecedents
and consequents. This is because records are not removed from the N and P
lists once a rule that satisfies them has been found, but instead these
records simply have their weughting reduced. This reduced weighting forms
part of the algorithm to calculate gains, however it is still possible for
the same attributes to be selected to form the antecedent of a rule because
these attributes (despite the reduce weighting) still produce the best
gain. Eventually the weighting for the effected records is reduced so far
that the attributes do not produce a best gain and are therefore not
selected. Where this occurs the rules with the lower accuracy are removed
from the rule list. */
protected void processRules() {
// Remove duplicates from rule list
removeDuplicateRules();
}
/* REMOVE DIPLICAT RULES */
/** Processes linked list of rules and removes any duplicates. */
private void removeDuplicateRules() {
RuleList.RuleNode linkRuleNode = currentRlist.startRulelist;
// Loop through linked list looking for duplicates
while (linkRuleNode != null) {
removeDuplicate(linkRuleNode,linkRuleNode.antecedent,
linkRuleNode.consequent);
linkRuleNode = linkRuleNode.next;
}
}
/* REMOVE DUPLICATE RULE */
/** Processes linked list of rules and removes any duplicate of the given
rule. */
private void removeDuplicate(RuleList.RuleNode linkRef, short[] antecedent,
short[] consequent) {
RuleList.RuleNode markerNode = linkRef;
RuleList.RuleNode linkRuleNode = linkRef.next;
// Loop through linked list looking for duplicate rule
while (linkRuleNode != null) {
if (isEqual(antecedent,linkRuleNode.antecedent) &&
isEqual(consequent,linkRuleNode.consequent)) {
markerNode.next = linkRuleNode.next;
}
else markerNode = linkRuleNode;
linkRuleNode = linkRuleNode.next;
}
}
/*----------------------------------------------------------------------- */
/* */
/* GET METHODS */
/* */
/*----------------------------------------------------------------------- */
/* GET CURRENT RULE LIST OBJECT */
/** Gets the current instance of the RuleList class.
@return the current RuleList object. */
public RuleList getCurrentRuleListObject() {
return(currentRlist);
}
/* ------------------------------------------------------------ */
/* */
/* MISCELANEOUS */
/* */
/* ------------------------------------------------------------ */
/* COPY DOUBLE 2-D ARRAY */
/** Copies the given 2-D array of doubles, which is assumed to have a
second dimension of 2, and returns the copy.
@param oldArray the given 2-D double array.
@return copy of the given array. */
protected double[][] copyDouble2Darray(double[][] oldArray) {
// Initialise new array
double[][] newArray = new double[oldArray.length][2];
// Loop through old array
for(int index=0;index