/* -------------------------------------------------------------------------- */ /* */ /* DEISSION TREE */ /* */ /* Frans Coenen */ /* */ /* Thursday 8 November 2007 */ /* */ /* Bug fixes: Wednesday 28 January 2008 */ /* Updated: Tuesday 3 January 2012, Monday 9mJanuary 2012 */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ //package lucsKDD_ARM; /* To compile: javaARMpackc.exe decisionTree.java */ // Java packages // Java packages import java.util.*; import java.io.*; // Java GUI packages import javax.swing.*; /** decision tree classifier generator. @author Frans Coenen @version 3 January 2011 */ public class DecisionTree extends AssocRuleMining { /* ------ FIELDS ------ */ // Data structures /** 2-D array to hold the test data

Note that classifiaction involves producing a set of Classification Rules (CRs) from a training set and then testing the effectiveness of the CRs on a test set. */ protected short[][] testDataArray = null; /** 3-data array to hold 10th sets of input data.

Used in conjunction with "10 Cross Validation" where the input data is divided into 10 subsets and CRs are produced using each subset in turn and validated against the remaininmg 9 sets. The oveerall average accuracy is then the total accuracy divided by 10. */ protected short[][][] tenthDataSets = new short[10][][]; /** Attribute list from which selections are made. */ private int[] attributeList = null; /** Start reference to decision tree. */ private DecisionTreeNode root = null; // Other fields /** Number of rows in input data set, not the same as the number of rows in the classification training set.

Used for temporery storage of total number of rows when using Ten Cross Validation (TCV) approach only.

The numRows field inherited from the super class records is used throughout the CR generation process. Set to number of rows in the training set using setNumRowsInInputSet method called by application class. */ protected int numRowsInInputSet; /** Number of rows in training set, also not the same as the number of rows in the classification training set. */ private int numRowsInTrainingSet = 0; /** Number of rows in test set, again not the same as the number of rows in the classification training set. */ private int numRowsInTestSet = 0; /** Number of attributes. */ protected int numAttributes = 0; /** Number of columns in test set, used for diagnostic purposes only --- should be same as in training set. */ protected int numColsInTestSet; /** Average accuracy as the result of TCV. */ protected double averageAccuracy; /** Standard deviation from average accuracy. */ protected double sdAccuracy; /* ------ CONSTRUCTORS ------ */ /** Constructor with command line arguments to be process. @param args the command line arguments (array of String instances). */ public DecisionTree(String[] args) { super(args); } /* ---------------------------------------------------------------- */ /* */ /* COMMAND LINE ARGUMENTS */ /* */ /* ---------------------------------------------------------------- */ /* IDENTIFY ARGUMENT */ /** Identifies nature of individual command line agruments: -C = confidence, -F = file name, -N = number of classes -S = support, -T testFile name.

(Overides higher level method.) @param argument the given argument. */ protected void idArgument(String argument) { if (argument.length()<3) { JOptionPane.showMessageDialog(null,"Command line argument '" + argument + "' too short.","COMMAND LINE " + "INPUT ERROR",JOptionPane.ERROR_MESSAGE); errorFlag = false; } else if (argument.charAt(0) == '-') { char flag = argument.charAt(1); argument = argument.substring(2,argument.length()); switch (flag) { case 'F': fileName = argument; break; case 'N': numClasses = Integer.parseInt(argument); break; case 'T': testSetFileName = argument; break; default: JOptionPane.showMessageDialog(null,"Unrecognise command "+ "line argument: '" + flag + argument + "'.", "COMMAND LINE INPUT ERROR",JOptionPane.ERROR_MESSAGE); errorFlag = false; } } else { JOptionPane.showMessageDialog(null,"All command line arguments " + "must commence with a '-' character ('" + argument + "').", "COMMAND LINE INPUT ERROR",JOptionPane.ERROR_MESSAGE); errorFlag = false; } } /* CHECK INPUT ARGUMENTS */ /** Invokes methods to check values associated with command line arguments (overides higher level method). */ protected void CheckInputArguments() { // Check support and confidence input checkSupportAndConfidence(); // Check file name checkFileName(); // Check number of classes checkNumClasses(); // Return if (errorFlag) outputSettings(); else outputMenu(); } /* CHECK NUMBER OF CLASSES */ /** Checks if number of classes command line parameter has been set appropriately. */ private void checkNumClasses() { if (numClasses == 0) { JOptionPane.showMessageDialog(null,"Must specify number of " + "classes (-N)","COMMAND LINE INPUT ERROR", JOptionPane.ERROR_MESSAGE); errorFlag = false; } else if (numClasses < 0) { JOptionPane.showMessageDialog(null,"Number of classes must be a " + "positive integer","COMMAND LINE INPUT ERROR", JOptionPane.ERROR_MESSAGE); errorFlag = false; } } /* SET SUPPORT AND CONFIDENCE */ /** Sets new values for the support and confidence fields. @param newSupport the new support value. @param newConfidence the new confidence value. */ /*public void setSupportAndConfidence(double newSupport, double newConfidence) { support = newSupport; confidence = newConfidence; }*/ /* ---------------------------------------------------------------- */ /* */ /* DATA SET UTILITIES */ /* */ /* ---------------------------------------------------------------- */ /* REORDER INPUT DATA: */ /** Reorders input data according to frequency of single attributes but excluding classifiers which are left unordered at the end of the attribute list.

Overides method in AssocRuleMining class. Note reordering makes for more efficient executuion of the T-tree (and P-tree) algorithms. Identical method in AprioriTclass class. */ /*public void idInputDataOrdering() { // Count singles and store in countArray; int[][] countArray = countSingles(); // Bubble sort count array on support value (second index) orderFirstNofCountArray(countArray,numCols-numClasses); // Define conversion and reconversion arrays defConvertArrays(countArray,numCols-numClasses); // Set sorted flag isOrderedFlag = true; }*/ /* CREATE TRAINING AND TEST DATA SETS. */ /** Populates test and training datasets.

Note: (1) assumes a 50:50 split, (2) training data set is stored in the dataArray structure in which the input data is stored, (3) method called from application class as same training and test sets may be required if using (say) "hill climbing" approach to maximise accuracy, (4) method is not called from constructor partly for same reason as 3 but also because the input data set may (given a particular application) first require ordering and possibly also pruning and recasting (see recastClassifiers method). */ public void createTrainingAndTestDataSets() { // Determine size of training and test sets. final double percentageSizeOfTestSet = 50.0; numRowsInTestSet = (int) ((double) numRows* percentageSizeOfTestSet/100.0); numRowsInTrainingSet = numRows-numRowsInTestSet; numRows = numRowsInTrainingSet; numRules = 0; // Dimension and populate training set. short[][] trainingSet = new short[numRowsInTrainingSet][]; int index1=0; for (;index1 Note: this method is not called from the constructor as the input data set may (given a particular application) first require ordering (and possibly also pruning!). */ public void createTenthsDataSets() { // If number of rows is less than 10 cannot create appropriate data // sets if (numRows<10) { JOptionPane.showMessageDialog(null,"Only " + numRows + ", therefore cannot create tenths data sets!", "TCV ERROR",JOptionPane.ERROR_MESSAGE); System.exit(1); } // Determine size of first nine tenths data sets. int tenthSize = numRows/10; // Dimension first nine tenths data sets. int index=0; for( ;index Note: (1) works on a 9:1 split with nine of the tenths data sets forming the training set and the remaining one tenth the test set, (2) training data set is stored in the same dataArray structure in which the initial input data is stored, (3) this method is not called from the constructor as the input data set may (given a particular application) first require ordering and possibly also pruning. @param testSetIndex the index of the tenths data sets to be used as the test set. */ public void createTrainingAndTestDataSets(int testSetIndex) { // Dimension and populate test set. numRowsInTestSet = tenthDataSets[testSetIndex].length; testDataArray = tenthDataSets[testSetIndex]; System.out.println("numRowsInTestSet = " + numRowsInTestSet); // Dimension of and populate training set. numRowsInTrainingSet = numRowsInInputSet-numRowsInTestSet; System.out.println("numRowsInTrainingSet = " + numRowsInTrainingSet); numRows = numRowsInTrainingSet; short[][] trainingSet = new short[numRows][]; int trainingSetIndex=0; // Before test set for(int index=0;indextestSetFileName.

Note that it is assumed that no empty records are included. Proceeds as follows:

  1. Gets number of rows (lines) in file, checking format of each line (space separated integers), if incorrectly formatted line found inputFormatOkFlag set to false.
  2. Dimensions input array.
  3. Reads data
*/ private void readTestFile() { try { // Dimension data structure. getNumberOfLines is in AssocRuleMining // class. inputFormatOkFlag=true; numRowsInTestSet = getNumberOfLines(testSetFileName); if (inputFormatOkFlag) { testDataArray = new short[numRowsInTestSet][]; // Read file System.out.println("Reading test set input file: " + testSetFileName); readTestDataSet(); } else JOptionPane.showMessageDialog(null,"Unable to read file: " + testSetFileName + "\n","FILE INPUT ERROR", JOptionPane.ERROR_MESSAGE); } catch(IOException ioException) { JOptionPane.showMessageDialog(null,"Unable to read file","FILE " + "INPUT ERROR",JOptionPane.ERROR_MESSAGE); closeFile(); System.exit(1); } } /* READ TEST DATA SET */ /** Reads test data from file specified in command line argument. */ public void readTestDataSet() throws IOException { int rowIndex=0; // Open the file openFileName(testSetFileName); // Get first row. String line = fileInput.readLine(); // Preocess rest of file while (line != null) { // Process line if (!processTestSetLine(line,rowIndex)) break; // Increment first (row) index in 2-D data array rowIndex++; // get next line line = fileInput.readLine(); } // Close file closeFile(); } /* PROCESS TEST SET LINE */ /** Processes a line from the test set file and places it in the testDataArray structure. @param line the line to be processed from the input file @param rowIndex the index to the current location in the testDataArray structure. @rerturn true if successfull, false if empty record. */ private boolean processTestSetLine(String line, int rowIndex) { // If no line return false if (line==null) return(false); // Tokenise line StringTokenizer dataLine = new StringTokenizer(line); int numberOfTokens = dataLine.countTokens(); // Empty line or end of file found, return false if (numberOfTokens == 0) return(false); // Convert input string to a sequence of short integers short[] code = binConversion(dataLine,numberOfTokens); // Dimension row in 2-D dataArray int codeLength = code.length; testDataArray[rowIndex] = new short[codeLength]; // Assign to elements in row for (int colIndex=0;colIndex maxAttribute) maxAttribute = testDataArray[index][lastIndex]; } numColsInTestSet = maxAttribute; } /* RECAST INPUT DATA. */ /** Recasts the contents of the test data array so that each record is ordered according to conversion array.

Proceed as follows: 1) For each record in the test data array. Create an empty new itemSet array. 2) Place into this array attribute/column numbers that correspond to the appropriate equivalents contained in the conversion array. 3) Reorder this itemSet and return into the test data array. */ public void recastTestData() { short[] itemSet; int attribute; // Step through data array using loop construct for(int rowIndex=0;rowIndex0) posExampleList = new int[numPosExamples]; if (numNegExamples>0) negExampleList = new int[numNegExamples]; createNewExampleLists(nodeAtt,exampleList,posExampleList, negExampleList); // Generate positive branch node.posBranch = new DecisionTreeNode(); generateNextNode2(node.posBranch,posExampleList,attributeList); // Generate negative branch node.negBranch = new DecisionTreeNode(); generateNextNode2(node.negBranch,negExampleList,attributeList); } /** Continue to generate next decision tree node. @param node the current decision tree node. @param exampleList the currebt list (indexes) of examples. @param attributeList the cuttent list of attrbutes. */ private void generateNextNode2(DecisionTreeNode node, int[] exampleList, short[] attributeList) { // No more examples if (exampleList==null) node.attNumber = 0; // Example list only references one class else if (exampleListUniform(exampleList)) { int lastIndex = dataArray[exampleList[0]].length-1; short nodeClass = dataArray[exampleList[0]][lastIndex]; node.attNumber = nodeClass; } // Check attribute list else if (attributeList==null) { //System.out.println("attributeList is null"); short nodeClass = getDominantClass(exampleList); node.attNumber = nodeClass; } // Prooced by first removing attributes that do not appear in the // examples else { attributeList = pruneAttributeList(attributeList,exampleList); // All attributes in attribute list may have been // pruned because they do not appear in examples, // there for check for empty attribute list. if (attributeList==null) { //System.out.println("attributeList is null (after pruning)"); short nodeClass = getDominantClass(exampleList); node.attNumber = nodeClass; } else generateNextNode(node,attributeList,exampleList); } } /** Prunes the attribute list by removing attributes that do not appear in the examples. @param attributeList the cuttent list of attrbutes. @param exampleList the currebt list of examples. @return revise attribute list. */ private short[] pruneAttributeList(short[] attributeList, int[] exampleList) { int i=0; //outputIntArray("attribute list",attributeList); //outputIntArray("exampleList",exampleList); // Loop while(idomClassCount) { domClassIndex = i; domClassCount = classArray[i]; } } // Return short classNum = (short) (domClassIndex+numAttributes+1); return(classNum); } /** Selects attribute from attribute list on which to split (stub). @param attributeList the cuttent list of attrbutes. @param exampleList the currebt list of examples. @return the attribute index of the selected attribute. */ protected int selectAttribute(short[] attributeList, int[] exampleList) { return(0); } /** Removes indicated attribute from attribute list. @param index the location in the old attributes list of the attribute to be removed. @param oldAttList the old attribute list to be revised. @return new attribute list. */ private short[] newAttributeList(int index, short[] oldAttList) { // New list is empty if (oldAttList.length==1) return(null); // Create new list short[] newAttList = new short[oldAttList.length-1]; int j=0; for (int i=0;i0) { short[] consequent = new short[1]; consequent[0] = node.attNumber; insertRuleIntoRulelist(antecedent,consequent,0.0); numRules++; } } else { short[] newAntecedent = reallocInsert(antecedent,node.attNumber); generateRuleList(newAntecedent,node.posBranch); generateRuleList(antecedent,node.negBranch); } } /* ------------------------------------------------------------- */ /* */ /* CLASSIFIER */ /* */ /* ------------------------------------------------------------- */ /* TEST CLASSIFICATION */ /** Tests the generated classification rules using the test set and returns percentage accuracy. @return the perecentage accuarcy. */ protected double testClassification() { int correctClassCounter = 0; // Check if test data exists, if not return' 0' if (testDataArray==null) { System.out.println("WARNING: No test data"); return(0.0); } // Check if any classification rules have been generated, if not // return'0' if (startRulelist==null) { String s = "No classification rules generated!\n"; System.out.print(s); return(0.0); } // Loop through test set int index=0; for(;index Copy of method in AprioriTclass. @param itemset the record to be classified. @return the classification. */ protected short classifyRecordDefault(short[] itemSet) { return(classifyRecordDefault(itemSet,startRulelist)); } /** Continues process of searching through rule data looking for a rule antecedent which is a subset of the input set or the default rule (last rule). @param itemset the record to be classified. @param node the currentNode. @return the classification. */ protected short classifyRecordDefault(short[] itemSet, RuleNode node) { // Process node if (node != null) { // Left branch short consClass = classifyRecordDefault(itemSet,node.leftBranch); if (consClass!=0) return(consClass); // Node if ((node.ruleNumber==numRules) || (isSubset(node.antecedent,itemSet))) return(node.consequent[0]); // Right branch return(classifyRecordDefault(itemSet,node.rightBranch)); } // Return return(0); } /* -------------------------------------------------------------------- */ /* */ /* RULE LINKED LIST ORDERED ACCORDING TO SIZE OF ANTECEDENT */ /* */ /* -------------------------------------------------------------------- */ /* Methods for inserting rules into a linked list of rules ordered according to size of antecedent (largest first) and confidence (most confident first). Each rule described in terms of 3 fields: 1) Antecedent (an item set), 2) a consequent (an item set), 3) a confidence value (double).

The support field is not used. */ /* INSERT (ASSOCIATION/CLASSIFICATION) RULE INTO RULE LINKED LIST (ORDERED ACCORDING CONFIDENCE). */ /** Inserts an (association/classification) rule into the linkedlist of rules pointed at by startRulelist.

The list is ordered so that rules with highest confidence are listed first. If two rules have the same confidence the new rule will be placed after the existing rule. Thus, if using an Apriori approach to generating rules, more general rules will appear first in the list with more specific rules (i.e. rules with a larger antecedent) appearing later as the more general rules will be generated first. @param antecedent the antecedent (LHS) of the rule. @param consequent the consequent (RHS) of the rule. @param supportForRule the associated support value. */ protected void insertRuleIntoRulelist(short[] antecedent, short[] consequent, double confidenceForRule) { // Check for empty tree. if (startRulelist == null) startRulelist = new RuleNode(antecedent, consequent,confidenceForRule,0.0); // Otherwise "walk" tree else insertRuleIntoRulelist(startRulelist,antecedent,consequent, confidenceForRule); } /** Continues process of adding rule to binary tree according to size of antecedent @param currentNode the current location in the bin tree. @param antecedent the antecedent (LHS) of the rule. @param consequent the consequent (RHS) of the rule. @param orderingValue the associated support value. */ private void insertRuleIntoRulelist(RuleNode currentNode, short[] antecedent, short[] consequent, double orderingValue) { boolean addToLeftBranch = false; // Calculate selector if (antecedent.length>currentNode.antecedent.length) addToLeftBranch=true; else if (antecedent.length==currentNode.antecedent.length && orderingValue>currentNode.confidenceForRule) addToLeftBranch=true; // Left branch if (addToLeftBranch) { if (currentNode.leftBranch==null) currentNode.leftBranch = new RuleNode(antecedent,consequent,orderingValue,0.0); else insertRuleIntoRulelist(currentNode.leftBranch,antecedent, consequent,orderingValue); } // Right branch else { if (currentNode.rightBranch==null) currentNode.rightBranch = new RuleNode(antecedent,consequent,orderingValue,0.0); else insertRuleIntoRulelist(currentNode.rightBranch,antecedent, consequent,orderingValue); } } /* ---------------------------------------------------------------- */ /* */ /* SET METHODS */ /* */ /* ---------------------------------------------------------------- * /* SET NUM ROWS IN INPUT SET */ /** Assigns value of numTows field to the numRowsInInputSet field.

Used in conjunction with TCV to "remember" the overall number of rows in the input data set.

Usually called from application classes. */ public void setNumRowsInInputSet() { numRowsInInputSet = numRows; } /* ---------------------------------------------------------------- */ /* */ /* OUTPUT */ /* */ /* ---------------------------------------------------------------- */ /* OUTPUT MENU */ /** Outputs menu for command line arguments. (Overides higher level method) */ protected void outputMenu() { System.out.println(); System.out.println("-F = Training file name"); System.out.println("-N = Number of classes"); System.out.println("-T = Test file name (only required in two file mode)"); System.out.println(); // Exit System.exit(1); } /* OUTPUT SETTINGS */ /** Outputs command line values provided by user. (Overides higher level method.) */ protected void outputSettings() { System.out.println("SETTINGS\n--------"); System.out.println("Training File name = " + fileName); if (testSetFileName!=null) System.out.println("Test File name = " + testSetFileName); System.out.println("Number of classes = " + numClasses); System.out.println(); } /* OUTPUT NUMBER OF CLASSES */ /** Outputs number of classes. */ public void outputNumClasses() { System.out.println("Number of classes = " + numClasses); } /* OUTPUT TEST DATA TABLE */ /** Outputs stored test set input data set read from input data file. */ public void outputTestDataArray() { System.out.println("TEST DATA ARRAY\n---------------"); for(int index=0;index0) System.out.print(","); puncCounter++; } System.out.print(input[i]); } // End System.out.println("}\n"); } /** Outputs the decision tree. */ public void outputDecTree() { String nodeNum = "1"; outputDecTree(nodeNum,root); } /** Outpurts a decision tree node. @param nodeNum the currentNumber. @param node the current node. */ private void outputDecTree(String nodeNum, DecisionTreeNode node) { if (node.posBranch==null) { System.out.print(nodeNum + " Class = "); if (node.attNumber==0) System.out.println("unknown"); else System.out.println(node.attNumber); } else { System.out.println(nodeNum + " Node = " + node.attNumber); outputDecTree(nodeNum+".1",node.posBranch); outputDecTree(nodeNum+".2",node.negBranch); } } /** Outputs a given short array. @param s the string to go with it. @param input the given array. */ private void outputIntArray(String s, short[] input) { System.out.print(s + " = {"); if (input==null) { System.out.println("}\n"); return; } // Loop int puncCounter = 0; int maxPerLine = 20; for (int i=0;i0) System.out.print(","); puncCounter++; } System.out.print(input[i]); } // End System.out.println("}\n"); } /** Continues provess of outputting contents of rule binary (if any), asuming that the list represents a set of CRs, such that last rule is the default rule.

If field numRules is temporarily altered this method can be used to output a particular rule as the default. Overides similar method in AssocRuleMining.java. @param linkRuleNode the currentNode. */ public void outputRulesWithDefault(RuleNode linkRuleNode) { // Process node if (linkRuleNode != null) { // Check if at end if (linkRuleNode.ruleNumber>numRules) return; // Left branch outputRulesWithDefault(linkRuleNode.leftBranch); // Node System.out.print("(" + linkRuleNode.ruleNumber + ") "); if (linkRuleNode.ruleNumber==numRules) System.out.print("Default"); else outputItemSet(linkRuleNode.antecedent); System.out.print(" -> "); outputItemSet(linkRuleNode.consequent); System.out.println(); // Right branch outputRulesWithDefault(linkRuleNode.rightBranch); } } }