/* -------------------------------------------------------------------------- */ /* */ /* DEISSION TREE */ /* */ /* Frans Coenen */ /* */ /* Thursday 8 November 2007 */ /* */ /* Bug fixes: Wednesday 28 January 2008 */ /* Updated: Tuesday 3 January 2012, Monday 9mJanuary 2012 */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ //package lucsKDD_ARM; /* To compile: javaARMpackc.exe decisionTree.java */ // Java packages // Java packages import java.util.*; import java.io.*; // Java GUI packages import javax.swing.*; /** decision tree classifier generator. @author Frans Coenen @version 3 January 2011 */ public class DecisionTree extends AssocRuleMining { /* ------ FIELDS ------ */ // Data structures /** 2-D array to hold the test data
Note that classifiaction involves producing a set of Classification Rules (CRs) from a training set and then testing the effectiveness of the CRs on a test set. */ protected short[][] testDataArray = null; /** 3-data array to hold 10th sets of input data.
Used in conjunction with "10 Cross Validation" where the input data is divided into 10 subsets and CRs are produced using each subset in turn and validated against the remaininmg 9 sets. The oveerall average accuracy is then the total accuracy divided by 10. */ protected short[][][] tenthDataSets = new short[10][][]; /** Attribute list from which selections are made. */ private int[] attributeList = null; /** Start reference to decision tree. */ private DecisionTreeNode root = null; // Other fields /** Number of rows in input data set, not the same as the number of rows in the classification training set.
Used for temporery storage of total number of rows when using Ten Cross Validation (TCV) approach only.
The numRows field inherited from the super class records is used throughout the CR generation process. Set to number of rows in the training set using setNumRowsInInputSet method called by application class. */ protected int numRowsInInputSet; /** Number of rows in training set, also not the same as the number of rows in the classification training set. */ private int numRowsInTrainingSet = 0; /** Number of rows in test set, again not the same as the number of rows in the classification training set. */ private int numRowsInTestSet = 0; /** Number of attributes. */ protected int numAttributes = 0; /** Number of columns in test set, used for diagnostic purposes only --- should be same as in training set. */ protected int numColsInTestSet; /** Average accuracy as the result of TCV. */ protected double averageAccuracy; /** Standard deviation from average accuracy. */ protected double sdAccuracy; /* ------ CONSTRUCTORS ------ */ /** Constructor with command line arguments to be process. @param args the command line arguments (array of String instances). */ public DecisionTree(String[] args) { super(args); } /* ---------------------------------------------------------------- */ /* */ /* COMMAND LINE ARGUMENTS */ /* */ /* ---------------------------------------------------------------- */ /* IDENTIFY ARGUMENT */ /** Identifies nature of individual command line agruments: -C = confidence, -F = file name, -N = number of classes -S = support, -T testFile name.
(Overides higher level method.) @param argument the given argument. */ protected void idArgument(String argument) { if (argument.length()<3) { JOptionPane.showMessageDialog(null,"Command line argument '" + argument + "' too short.","COMMAND LINE " + "INPUT ERROR",JOptionPane.ERROR_MESSAGE); errorFlag = false; } else if (argument.charAt(0) == '-') { char flag = argument.charAt(1); argument = argument.substring(2,argument.length()); switch (flag) { case 'F': fileName = argument; break; case 'N': numClasses = Integer.parseInt(argument); break; case 'T': testSetFileName = argument; break; default: JOptionPane.showMessageDialog(null,"Unrecognise command "+ "line argument: '" + flag + argument + "'.", "COMMAND LINE INPUT ERROR",JOptionPane.ERROR_MESSAGE); errorFlag = false; } } else { JOptionPane.showMessageDialog(null,"All command line arguments " + "must commence with a '-' character ('" + argument + "').", "COMMAND LINE INPUT ERROR",JOptionPane.ERROR_MESSAGE); errorFlag = false; } } /* CHECK INPUT ARGUMENTS */ /** Invokes methods to check values associated with command line arguments (overides higher level method). */ protected void CheckInputArguments() { // Check support and confidence input checkSupportAndConfidence(); // Check file name checkFileName(); // Check number of classes checkNumClasses(); // Return if (errorFlag) outputSettings(); else outputMenu(); } /* CHECK NUMBER OF CLASSES */ /** Checks if number of classes command line parameter has been set appropriately. */ private void checkNumClasses() { if (numClasses == 0) { JOptionPane.showMessageDialog(null,"Must specify number of " + "classes (-N)","COMMAND LINE INPUT ERROR", JOptionPane.ERROR_MESSAGE); errorFlag = false; } else if (numClasses < 0) { JOptionPane.showMessageDialog(null,"Number of classes must be a " + "positive integer","COMMAND LINE INPUT ERROR", JOptionPane.ERROR_MESSAGE); errorFlag = false; } } /* SET SUPPORT AND CONFIDENCE */ /** Sets new values for the support and confidence fields. @param newSupport the new support value. @param newConfidence the new confidence value. */ /*public void setSupportAndConfidence(double newSupport, double newConfidence) { support = newSupport; confidence = newConfidence; }*/ /* ---------------------------------------------------------------- */ /* */ /* DATA SET UTILITIES */ /* */ /* ---------------------------------------------------------------- */ /* REORDER INPUT DATA: */ /** Reorders input data according to frequency of single attributes but excluding classifiers which are left unordered at the end of the attribute list.
Overides method in AssocRuleMining class. Note reordering makes for more efficient executuion of the T-tree (and P-tree) algorithms. Identical method in AprioriTclass class. */ /*public void idInputDataOrdering() { // Count singles and store in countArray; int[][] countArray = countSingles(); // Bubble sort count array on support value (second index) orderFirstNofCountArray(countArray,numCols-numClasses); // Define conversion and reconversion arrays defConvertArrays(countArray,numCols-numClasses); // Set sorted flag isOrderedFlag = true; }*/ /* CREATE TRAINING AND TEST DATA SETS. */ /** Populates test and training datasets.
Note: (1) assumes a 50:50
split, (2) training data set is stored in the dataArray structure in which
the input data is stored, (3) method called from application class as same
training and test sets may be required if using (say) "hill climbing"
approach to maximise accuracy, (4) method is not called from constructor
partly for same reason as 3 but also because the input data set may (given
a particular application) first require ordering and possibly also pruning
and recasting (see recastClassifiers method). */
public void createTrainingAndTestDataSets() {
// Determine size of training and test sets.
final double percentageSizeOfTestSet = 50.0;
numRowsInTestSet = (int) ((double) numRows*
percentageSizeOfTestSet/100.0);
numRowsInTrainingSet = numRows-numRowsInTestSet;
numRows = numRowsInTrainingSet;
numRules = 0;
// Dimension and populate training set.
short[][] trainingSet = new short[numRowsInTrainingSet][];
int index1=0;
for (;index1 Note that it is assumed that no empty records
are included. Proceeds as follows:
Proceed as follows:
1) For each record in the test data array. Create an empty new itemSet
array.
2) Place into this array attribute/column numbers that correspond to the
appropriate equivalents contained in the conversion array.
3) Reorder this itemSet and return into the test data array. */
public void recastTestData() {
short[] itemSet;
int attribute;
// Step through data array using loop construct
for(int rowIndex=0;rowIndex The support field is not used. */
/* INSERT (ASSOCIATION/CLASSIFICATION) RULE INTO RULE LINKED LIST (ORDERED
ACCORDING CONFIDENCE). */
/** Inserts an (association/classification) rule into the linkedlist of
rules pointed at by startRulelist. The list is ordered so that
rules with highest confidence are listed first. If two rules have the same
confidence the new rule will be placed after the existing rule. Thus, if
using an Apriori approach to generating rules, more general rules will
appear first in the list with more specific rules (i.e. rules with a larger
antecedent) appearing later as the more general rules will be generated
first.
@param antecedent the antecedent (LHS) of the rule.
@param consequent the consequent (RHS) of the rule.
@param supportForRule the associated support value. */
protected void insertRuleIntoRulelist(short[] antecedent,
short[] consequent, double confidenceForRule) {
// Check for empty tree.
if (startRulelist == null) startRulelist = new RuleNode(antecedent,
consequent,confidenceForRule,0.0);
// Otherwise "walk" tree
else insertRuleIntoRulelist(startRulelist,antecedent,consequent,
confidenceForRule);
}
/** Continues process of adding rule to binary tree according to size of
antecedent
@param currentNode the current location in the bin tree.
@param antecedent the antecedent (LHS) of the rule.
@param consequent the consequent (RHS) of the rule.
@param orderingValue the associated support value. */
private void insertRuleIntoRulelist(RuleNode currentNode,
short[] antecedent, short[] consequent, double orderingValue) {
boolean addToLeftBranch = false;
// Calculate selector
if (antecedent.length>currentNode.antecedent.length)
addToLeftBranch=true;
else if (antecedent.length==currentNode.antecedent.length &&
orderingValue>currentNode.confidenceForRule)
addToLeftBranch=true;
// Left branch
if (addToLeftBranch) {
if (currentNode.leftBranch==null) currentNode.leftBranch = new
RuleNode(antecedent,consequent,orderingValue,0.0);
else insertRuleIntoRulelist(currentNode.leftBranch,antecedent,
consequent,orderingValue);
}
// Right branch
else {
if (currentNode.rightBranch==null) currentNode.rightBranch = new
RuleNode(antecedent,consequent,orderingValue,0.0);
else insertRuleIntoRulelist(currentNode.rightBranch,antecedent,
consequent,orderingValue);
}
}
/* ---------------------------------------------------------------- */
/* */
/* SET METHODS */
/* */
/* ---------------------------------------------------------------- *
/* SET NUM ROWS IN INPUT SET */
/** Assigns value of numTows field to the
numRowsInInputSet field. Used in conjunction with TCV to
"remember" the overall number of rows in the input data set. Usually
called from application classes. */
public void setNumRowsInInputSet() {
numRowsInInputSet = numRows;
}
/* ---------------------------------------------------------------- */
/* */
/* OUTPUT */
/* */
/* ---------------------------------------------------------------- */
/* OUTPUT MENU */
/** Outputs menu for command line arguments. (Overides higher level method)
*/
protected void outputMenu() {
System.out.println();
System.out.println("-F = Training file name");
System.out.println("-N = Number of classes");
System.out.println("-T = Test file name (only required in two file mode)");
System.out.println();
// Exit
System.exit(1);
}
/* OUTPUT SETTINGS */
/** Outputs command line values provided by user. (Overides higher level
method.) */
protected void outputSettings() {
System.out.println("SETTINGS\n--------");
System.out.println("Training File name = " + fileName);
if (testSetFileName!=null) System.out.println("Test File name = " +
testSetFileName);
System.out.println("Number of classes = " + numClasses);
System.out.println();
}
/* OUTPUT NUMBER OF CLASSES */
/** Outputs number of classes. */
public void outputNumClasses() {
System.out.println("Number of classes = " + numClasses);
}
/* OUTPUT TEST DATA TABLE */
/** Outputs stored test set input data set read from input data file. */
public void outputTestDataArray() {
System.out.println("TEST DATA ARRAY\n---------------");
for(int index=0;index If field numRules is temporarily altered this
method can be used to output a particular rule as the default. Overides
similar method in AssocRuleMining.java.
@param linkRuleNode the currentNode. */
public void outputRulesWithDefault(RuleNode linkRuleNode) {
// Process node
if (linkRuleNode != null) {
// Check if at end
if (linkRuleNode.ruleNumber>numRules) return;
// Left branch
outputRulesWithDefault(linkRuleNode.leftBranch);
// Node
System.out.print("(" + linkRuleNode.ruleNumber + ") ");
if (linkRuleNode.ruleNumber==numRules) System.out.print("Default");
else outputItemSet(linkRuleNode.antecedent);
System.out.print(" -> ");
outputItemSet(linkRuleNode.consequent);
System.out.println();
// Right branch
outputRulesWithDefault(linkRuleNode.rightBranch);
}
}
}
*/
private void readTestFile() {
try {
// Dimension data structure. getNumberOfLines is in AssocRuleMining
// class.
inputFormatOkFlag=true;
numRowsInTestSet = getNumberOfLines(testSetFileName);
if (inputFormatOkFlag) {
testDataArray = new short[numRowsInTestSet][];
// Read file
System.out.println("Reading test set input file: " +
testSetFileName);
readTestDataSet();
}
else JOptionPane.showMessageDialog(null,"Unable to read file: " +
testSetFileName + "\n","FILE INPUT ERROR",
JOptionPane.ERROR_MESSAGE);
}
catch(IOException ioException) {
JOptionPane.showMessageDialog(null,"Unable to read file","FILE " +
"INPUT ERROR",JOptionPane.ERROR_MESSAGE);
closeFile();
System.exit(1);
}
}
/* READ TEST DATA SET */
/** Reads test data from file specified in command line argument. */
public void readTestDataSet() throws IOException {
int rowIndex=0;
// Open the file
openFileName(testSetFileName);
// Get first row.
String line = fileInput.readLine();
// Preocess rest of file
while (line != null) {
// Process line
if (!processTestSetLine(line,rowIndex)) break;
// Increment first (row) index in 2-D data array
rowIndex++;
// get next line
line = fileInput.readLine();
}
// Close file
closeFile();
}
/* PROCESS TEST SET LINE */
/** Processes a line from the test set file and places it in the
testDataArray structure.
@param line the line to be processed from the input file
@param rowIndex the index to the current location in the
testDataArray structure.
@rerturn true if successfull, false if empty record. */
private boolean processTestSetLine(String line, int rowIndex) {
// If no line return false
if (line==null) return(false);
// Tokenise line
StringTokenizer dataLine = new StringTokenizer(line);
int numberOfTokens = dataLine.countTokens();
// Empty line or end of file found, return false
if (numberOfTokens == 0) return(false);
// Convert input string to a sequence of short integers
short[] code = binConversion(dataLine,numberOfTokens);
// Dimension row in 2-D dataArray
int codeLength = code.length;
testDataArray[rowIndex] = new short[codeLength];
// Assign to elements in row
for (int colIndex=0;colIndex