/* -------------------------------------------------------------------------- */ /* */ /* CLASSIFICATION */ /* */ /* Frans Coenen */ /* */ /* Wednesday 4 February 2004 */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Class structure AssocRuleMining | +-- Classification */ // Java packages import java.util.*; import java.io.*; /** Methods to produce classification rules using FOIL style algorithms. Assumes that input dataset is organised such that classes are at the end of each record. Note: number of classes value is stored in the numClasses field. @author Frans Coenen @version 29 April 2003 */ public class Classification extends AssocRuleMining { /* ------ FIELDS ------ */ // Data structures /** 2-D array to hold the test data set.

Classification involves producing a set of Classification Rules (CRs) from a training set and then testing the effectiveness of the CRs on a test set. */ protected short[][] testDataArray = null; /** 3-D array to hold 10th sets of input data.

Used in conjunction with "10 Cross Validation" where the input data is divided into 10 subsets and CRs are produced using each subset in turn and validated against the remaining 9 sets. The overall average accuracy is then the total accuracy divided by 10. */ protected short[][][] tenthDataSets = new short[10][][]; /** 1-D data array to hold classifiers (can not call it classes because this is a reserved word!). */ protected short[] classifiers = null; /** 2-D attribute array, first row contains '0.0' or '1.0' (0.0=available, 1.0=unavailable), second row contains gain.

Used in FOIL, PRM and CPAR. */ protected double[][] attributes = null; /** Temporary 2-D attribute array.

Used in FOIL, PRM and CPAR. */ protected double[][] attributes2 = null; // Constants /** The maximum number of rules to be considered per class when classifying records during testing. Usually set to 5. */ protected final int K_VALUE = 5; /** The minimum best gain acceptable for the process of the generation of a single rule to continue.

Used in FOIL, PRM and CPAR.*/ protected final double MIN_BEST_GAIN=0.7; // Other fields /** Number of rows in input data set.

Note this is not the same as the number of rows in the classification training set. Used for temporary storage of total number of rows when using Ten Cross Validation (TCV) approach only.

The numRows field inherited from the super class records is used throughout the CR generation process. Set to number of rows using setNumRowsInInputSet method called by application class. */ protected int numRowsInInputSet; /** Number of rows in test set, not the same as the number of rows in the classification training set. */ protected int numRowsInTestSet; /** Number of rows in training set, not the same as the number of rows in the classification test set. */ protected int numRowsInTrainingSet; /** Instance of the class RuleList */ protected RuleList currentRlist = null; /* ------ CONSTRUCTORS ------ */ /** Constructor for Classification class which also processes command line arguments. @param args the command line arguments (array of String instances). */ public Classification(String[] args) { super(args); // Create RuleList object for later usage currentRlist = new RuleList(); } /* ------ METHODS ------ */ /** Starts classification rule generation process. @return The classification accuracy (%). */ public double startClassification() { /* STUB */ return(0.0); // Default } /* ------------------------------------------------------------ */ /* */ /* GAIN CALCULATIONS */ /* */ /* ------------------------------------------------------------ */ /* FOIL, PRM and CPAR all use the concept of "gain" to judge whether the inclusion of an attribute within a rule "buys" anything. How gain values are calculated and used depends on the algorithm bit methods included here are general methods used by all three algorithms. */ /* GET BEST GAIN */ /** Process attributes array to identify the best gain @return the index of the best attribute, if no available attributes or will return 0. */ protected int getBestGain() { int indexOfBestAttribute = 0; double bestGainSofar = 0.0; // Loop through attributes2 array for (int index=1;indexbestGainSofar) { indexOfBestAttribute = index; bestGainSofar = attributes2[index][1]; } } } // Return return(indexOfBestAttribute); } /* ---------------------------------------------------------------- */ /* */ /* TEST CLASSIFICATION */ /* */ /* ---------------------------------------------------------------- */ /* TEST CLASSIFICATION */ /** Tests the generated classification rules using test sets and return percentage accuracy. @param the percentage accuracy. */ protected double testClassification() { int correctClassCounter = 0; int wrongClassCounter = 0; int unclassifiedCounter = 0; // Check if test data exists, if not return' 0' if (testDataArray==null) { System.out.println("ERROR: No test data"); return(0); } // Check if any classification rules have been generated, if not // return'0' if (currentRlist.startRulelist==null) { System.out.println("No classification rules generated!"); return(0); } // Loop through test set int index=0; for(;index Note: (1) training data set is stored in the dataArray structure in which the input data was originally stored, (2) method called from application class not from Classification class constructor because the input data set may (given a particular application) first require ordering and possibly also pruning and recasting (see recastClassifiers method). */ public void createTrainingAndTestDataSets() { // Determine size of training and test sets. final double PERCENTAGE_SIZE_OF_TEST_SET = 50.0; numRowsInTestSet = (int) ((double) numRows* PERCENTAGE_SIZE_OF_TEST_SET/100.0); numRowsInTrainingSet = numRows-numRowsInTestSet; numRows = numRowsInTrainingSet; // Output System.out.println("Num. of recs. in training set = " + numRowsInTrainingSet + "\nNum. of recs. " + "in test set = " + numRowsInTestSet); // Dimension and populate training set. short[][] trainingSet = new short[numRowsInTrainingSet][]; int index1=0; for (;index1 Note: (1) useing TCV (Ten Cross Validation), (2) training data set is stored in the dataArray structure in which the input data was originally stored, in which the initial input data is stored, (3) this method is not called from the constructor as the input data set may (given a particular application) first require ordering and possibly also pruning. @param testSetIndex the index of the tenths data sets to be used as the test set. */ private void createTrainingAndTestDataSets(int testSetIndex) { // Dimension and populate test set. numRowsInTestSet = tenthDataSets[testSetIndex].length; testDataArray = tenthDataSets[testSetIndex]; // Dimension of and populate training set. numRowsInTrainingSet = numRowsInInputSet-numRowsInTestSet; numRows = numRowsInTrainingSet; short[][] trainingSet = new short[numRows][]; int trainingSetIndex=0; // Before test set for(int index=0;index It is possible, using PRM or CPAR, to generate two or more rules with identical antecedents and consequents. This is because records are not removed from the N and P lists once a rule that satisfies them has been found, but instead these records simply have their weughting reduced. This reduced weighting forms part of the algorithm to calculate gains, however it is still possible for the same attributes to be selected to form the antecedent of a rule because these attributes (despite the reduce weighting) still produce the best gain. Eventually the weighting for the effected records is reduced so far that the attributes do not produce a best gain and are therefore not selected. Where this occurs the rules with the lower accuracy are removed from the rule list. */ protected void processRules() { // Remove duplicates from rule list removeDuplicateRules(); } /* REMOVE DIPLICAT RULES */ /** Processes linked list of rules and removes any duplicates. */ private void removeDuplicateRules() { RuleList.RuleNode linkRuleNode = currentRlist.startRulelist; // Loop through linked list looking for duplicates while (linkRuleNode != null) { removeDuplicate(linkRuleNode,linkRuleNode.antecedent, linkRuleNode.consequent); linkRuleNode = linkRuleNode.next; } } /* REMOVE DUPLICATE RULE */ /** Processes linked list of rules and removes any duplicate of the given rule. */ private void removeDuplicate(RuleList.RuleNode linkRef, short[] antecedent, short[] consequent) { RuleList.RuleNode markerNode = linkRef; RuleList.RuleNode linkRuleNode = linkRef.next; // Loop through linked list looking for duplicate rule while (linkRuleNode != null) { if (isEqual(antecedent,linkRuleNode.antecedent) && isEqual(consequent,linkRuleNode.consequent)) { markerNode.next = linkRuleNode.next; } else markerNode = linkRuleNode; linkRuleNode = linkRuleNode.next; } } /*----------------------------------------------------------------------- */ /* */ /* GET METHODS */ /* */ /*----------------------------------------------------------------------- */ /* GET CURRENT RULE LIST OBJECT */ /** Gets the current instance of the RuleList class. @return the current RuleList object. */ public RuleList getCurrentRuleListObject() { return(currentRlist); } /* ------------------------------------------------------------ */ /* */ /* MISCELANEOUS */ /* */ /* ------------------------------------------------------------ */ /* COPY DOUBLE 2-D ARRAY */ /** Copies the given 2-D array of doubles, which is assumed to have a second dimension of 2, and returns the copy. @param oldArray the given 2-D double array. @return copy of the given array. */ protected double[][] copyDouble2Darray(double[][] oldArray) { // Initialise new array double[][] newArray = new double[oldArray.length][2]; // Loop through old array for(int index=0;index