/* -------------------------------------------------------------------------- */ /* */ /* ASSOCIATION RULE DATA MINING */ /* */ /* Frans Coenen */ /* */ /* Wednesday 9 January 2003 */ /* (revised 21/1/2003, 14/2/2003, 2/5/2003, 2/7/2003, 3/2/2004, 8/5/2004, */ /* 1/2/2005, 3/2/2005) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* To compile: javac.exe AssocRuleMining.java */ // Java packages import java.io.*; import java.util.*; // Java GUI packages import javax.swing.*; /** Set of utillities to support various Association Rule Mining (ARM) algorithms. @author Frans Coenen @version 1 February 2005 */ public class AssocRuleMining extends JFrame { /* ------ FIELDS ------ */ // Inner class for storing linked list of ARs or CARs as appropriate. protected class RuleNode { /** Antecedent of AR. */ protected short[] antecedent; /** Consequent of AR. */ protected short[] consequent; /** The confidence value associate with the rule represented by this node. */ double confidenceForRule=0.0; /** Link to next node */ RuleNode next = null; /** Three argument constructor @param ante the antecedent (LHS) of the AR. @param cons the consequent (RHS) of the AR. @param confValue the associated confidence value. */ protected RuleNode(short[] ante, short[]cons, double confValue) { antecedent = ante; consequent = cons; confidenceForRule = confValue; } } // Data structures /** The reference to start of the rule list. */ protected RuleNode startRulelist = null; /** 2-D aray to hold input data from data file. Note that within the data array records are numbered from zero, thus rexcord one has index 0 etc. */ protected short[][] dataArray = null; /** 2-D array used to renumber columns for input data in terms of frequency of single attributes (reordering will enhance performance for some ARM algorithms). */ protected int[][] conversionArray = null; /** 1-D array used to reconvert input data column numbers to their original numbering where the input data has been ordered to enhance computational efficiency. */ protected short[] reconversionArray = null; // Constants /** Minimum support value */ private static final double MIN_SUPPORT = 0.0; /** Maximum support value */ private static final double MAX_SUPPORT = 100.0; /** Maximum confidence value */ private static final double MIN_CONFIDENCE = 0.0; /** Maximum confidence value */ private static final double MAX_CONFIDENCE = 100.0; // Command line arguments with default values and associated fields. /** Command line argument for data file name. */ protected String fileName = null; /** Command line argument for number of columns. */ protected int numCols = 0; /** Command line argument for number of rows. */ protected int numRows = 0; /** Command line argument for % support (default = 20%). */ protected double support = 20.0; /** Minimum support value in terms of number of rows.

Set when input data is read and the number of records is known, */ protected double minSupport = 0; /** Command line argument for % confidence (default = 80%). */ protected double confidence = 80.0; /** The number of one itemsets (singletons). */ protected int numOneItemSets = 0; // Flags /** Error flag used when checking command line arguments (default = true). */ protected boolean errorFlag = true; /** Input format OK flag( default = true). */ protected boolean inputFormatOkFlag = true; /** Flag to indicate whether system has data or not. */ private boolean haveDataFlag = false; /** Flag to indicate whether input data has been sorted or not. */ protected boolean isOrderedFlag = false; /** Flag to indicate whether input data has been sorted and pruned or not. */ protected boolean isPrunedFlag = false; // Other fields /** The input stream. */ protected BufferedReader fileInput; /** The file path */ protected File filePath = null; /* ------ CONSTRUCTORS ------ */ /** Processes command line arguments */ public AssocRuleMining(String[] args) { // Process command line arguments for(int index=0;indexerrorFlag set to false. */ protected void checkSupportAndConfidence() { // Check Support if ((support < MIN_SUPPORT) || (support > MAX_SUPPORT)) { System.out.println("INPUT ERROR: Support must be specified " + "as a percentage (" + MIN_SUPPORT + " - " + MAX_SUPPORT + ")"); errorFlag = false; } // Check confidence if ((confidence < MIN_CONFIDENCE) || (confidence > MAX_CONFIDENCE)) { System.out.println("INPUT ERROR: Confidence must be " + "specified as a percentage (" + MIN_CONFIDENCE + " - " + MAX_CONFIDENCE + ")"); errorFlag = false; } } /* CHECK FILE NAME */ /** Checks if data file name provided, if not errorFlag set to false. */ protected void checkFileName() { if (fileName == null) { System.out.println("INPUT ERROR: Must specify file name (-F)"); errorFlag = false; } } /* ---------------------------------------------------------------- */ /* */ /* READ INPUT DATA FROM FILE */ /* */ /* ---------------------------------------------------------------- */ /* INPUT DATA SET */ /** Commences process of getting input data (GUI version also exists). */ public void inputDataSet() { // Read the file readFile(); // Check ordering (only if input format is OK) if (inputFormatOkFlag) { if (checkOrdering()) { System.out.println("Number of records = " + numRows); countNumCols(); System.out.println("Number of columns = " + numCols); minSupport = (numRows * support)/100.0; System.out.println("Min support = " + twoDecPlaces(minSupport) + " (records)"); } else { System.out.println("Error reading file: " + fileName + "\n"); closeFile(); System.exit(1); } } } /* READ FILE */ /** Reads input data from file specified in command line argument fileName.

Note that it is assumed that no empty records are included. Proceeds as follows:

  1. Gets number of rows (lines) in file, checking format of each line (space separated integers), if incorrectly formatted line found inputFormatOkFlag set to false.
  2. Dimensions input array.
  3. Reads data
*/ protected void readFile() { try { // Dimension data structure inputFormatOkFlag=true; numRows = getNumberOfLines(fileName); if (inputFormatOkFlag) { dataArray = new short[numRows][]; // Read file System.out.println("Reading input file: " + fileName); readInputDataSet(); } else System.out.println("Error reading file: " + fileName + "\n"); } catch(IOException ioException) { System.out.println("Error reading File"); closeFile(); System.exit(1); } } /* GET NUMBER OF LINES */ /** Gets number of lines/records in input file and checks format of each line. @param nameOfFile the filename of the file to be opened. @return the number of rows in the given file. */ protected int getNumberOfLines(String nameOfFile) throws IOException { int counter = 0; // Open the file if (filePath==null) openFileName(nameOfFile); else openFilePath(); // Loop through file incrementing counter // get first row. String line = fileInput.readLine(); while (line != null) { checkLine(counter+1,line); StringTokenizer dataLine = new StringTokenizer(line); int numberOfTokens = dataLine.countTokens(); if (numberOfTokens == 0) break; counter++; line = fileInput.readLine(); } // Close file and return closeFile(); return(counter); } /* CHECK LINE */ /** Check whether given line from input file is of appropriate format (space separated integers), if incorrectly formatted line found inputFormatOkFlag set to false. @param counter the line number in the input file. @param str the current line from the input file. */ protected void checkLine(int counter, String str) { for (int index=0;index dataArray structure. @param line the line to be processed from the input file @param rowIndex the index to the current location in the dataArray structure. @rerturn true if successfull, false if empty record. */ private boolean processInputLine(String line, int rowIndex) { // If no line return false if (line==null) return(false); // Tokenise line StringTokenizer dataLine = new StringTokenizer(line); int numberOfTokens = dataLine.countTokens(); // Empty line or end of file found, return false if (numberOfTokens == 0) return(false); // Convert input string to a sequence of short integers short[] code = binConversion(dataLine,numberOfTokens); // Dimension row in 2-D dataArray int codeLength = code.length; dataArray[rowIndex] = new short[codeLength]; // Assign to elements in row for (int colIndex=0;colIndex= itemSet[index+1]) { JOptionPane.showMessageDialog(null,"FILE FORMAT ERROR:\n" + "Attribute data in line " + lineNum + " not in numeric order"); return(false); } } // Default return return(true); } /* COUNT NUMBER OF COLUMNS */ /** Counts number of columns represented by input data. */ protected void countNumCols() { int maxAttribute=0; // Loop through data array for(int index=0;index maxAttribute) maxAttribute = dataArray[index][lastIndex]; } numCols = maxAttribute; numOneItemSets = numCols; // default value only } /* OPEN FILE NAME */ /** Opens input file using fileName (instance field). @param nameOfFile the filename of the file to be opened. */ protected void openFileName(String nameOfFile) { try { // Open file FileReader file = new FileReader(nameOfFile); fileInput = new BufferedReader(file); } catch(IOException ioException) { JOptionPane.showMessageDialog(this,"Error Opening File", "Error: ",JOptionPane.ERROR_MESSAGE); System.exit(1); } } /* OPEN FILE PATH */ /** Opens file using filePath (instance field). */ protected void openFilePath() { try { // Open file FileReader file = new FileReader(filePath); fileInput = new BufferedReader(file); } catch(IOException ioException) { JOptionPane.showMessageDialog(this,"Error Opening File", "Error: ",JOptionPane.ERROR_MESSAGE); System.exit(1); } } /* CLOSE FILE */ /** Close file fileName (instance field). */ protected void closeFile() { if (fileInput != null) { try { fileInput.close(); } catch (IOException ioException) { JOptionPane.showMessageDialog(this,"Error Closeing File", "Error: ",JOptionPane.ERROR_MESSAGE); System.exit(1); } } } /* BINARY CONVERSION. */ /** Produce an item set (array of elements) from input line. @param dataLine row from the input data file @param numberOfTokens number of items in row @return 1-D array of short integers representing attributes in input row */ protected short[] binConversion(StringTokenizer dataLine, int numberOfTokens) { short number; short[] newItemSet = null; // Load array for (int tokenCounter=0;tokenCounter < numberOfTokens;tokenCounter++) { number = new Short(dataLine.nextToken()).shortValue(); newItemSet = realloc1(newItemSet,number); } // Return itemSet return(newItemSet); } /* ---------------------------------------------------------------- */ /* */ /* REORDER DATA SET ACCORDING TO ATTRIBUTE FREQUENCY */ /* */ /* ---------------------------------------------------------------- */ /* REORDER INPUT DATA: */ /** Reorders input data according to frequency of single attributes.

Example, given the data set:

    1 2 5
    1 2 3
    2 4 5
    1 2 5
    2 3 5
    
This would produce a countArray (ignore index 0):
    +---+---+---+---+---+---+
    |   | 1 | 2 | 3 | 4 | 5 |
    +---+---+---+---+---+---+
    |   | 3 | 5 | 2 | 1 | 4 |
    +---+---+---+---+---+---+
    
Which sorts to:
    +---+---+---+---+---+---+
    |   | 2 | 5 | 1 | 3 | 4 |
    +---+---+---+---+---+---+
    |   | 5 | 4 | 3 | 2 | 1 |
    +---+---+---+---+---+---+
    
Giving rise to the conversion Array of the form (no index 0):
    +---+---+---+---+---+---+
    |   | 3 | 1 | 4 | 5 | 2 |
    +---+---+---+---+---+---+
    |   | 3 | 5 | 2 | 1 | 4 |
    +---+---+---+---+---+---+
    
Note that the second row here are the counts which no longer play a role in the conversion exercise. Thus to the new column number for column 1 is column 3 (i.e. the first vale at index 1). The reconversion array of the form:
    +---+---+---+---+---+---+
    |   | 2 | 5 | 1 | 3 | 4 |
    +---+---+---+---+---+---+		
    
*/ public void idInputDataOrdering() { // Count singles and store in countArray; int[][] countArray = countSingles(); // Bubble sort count array on support value (second index) orderCountArray(countArray); // Define conversion and reconversion arrays defConvertArrays(countArray); // Set sorted flag isOrderedFlag = true; } /* COUNT SINGLES */ /** Counts number of occurrences of each single attribute in the input data. @return 2-D array where first row represents column numbers and second row represents support counts. */ protected int[][] countSingles() { // Dimension and initialize count array int[][] countArray = new int[numCols+1][2]; for (int index=0;indexcountSingles method so that array is ordered according to frequency of single items. @param countArray The 2-D array returned by the countSingles method. */ private void orderCountArray(int[][] countArray) { int attribute, quantity; boolean isOrdered; int index; do { isOrdered = true; index = 1; while (index < (countArray.length-1)) { if (countArray[index][1] >= countArray[index+1][1]) index++; else { isOrdered=false; // Swap attribute = countArray[index][0]; quantity = countArray[index][1]; countArray[index][0] = countArray[index+1][0]; countArray[index][1] = countArray[index+1][1]; countArray[index+1][0] = attribute; countArray[index+1][1] = quantity; // Increment index index++; } } } while (isOrdered==false); } /* ORDER FIRST N ELEMENTS IN COUNT ARRAY */ /** Bubble sorts first N elements in count array produced by countSingles method so that array is ordered according to frequency of single items.

Used when ordering classification input data. @param countArray The 2-D array returned by the countSingles method. @param endIndex the index of the Nth element. */ protected void orderFirstNofCountArray(int[][] countArray, int endIndex) { int attribute, quantity; boolean isOrdered; int index; do { isOrdered = true; index = 1; while (index < endIndex) { if (countArray[index][1] >= countArray[index+1][1]) index++; else { isOrdered=false; // Swap attribute = countArray[index][0]; quantity = countArray[index][1]; countArray[index][0] = countArray[index+1][0]; countArray[index][1] = countArray[index+1][1]; countArray[index+1][0] = attribute; countArray[index+1][1] = quantity; // Increment index index++; } } } while (isOrdered==false); } /* DEFINE CONVERSION ARRAYS: */ /** Defines conversion and reconversion arrays. @param countArray The 2-D array sorted by the orderCcountArray method.*/ protected void defConvertArrays(int[][] countArray) { // Dimension arrays conversionArray = new int[numCols+1][2]; reconversionArray = new short[numCols+1]; // Assign values for(int index=1;indexProceed as follows: 1) For each record in the data array. Create an empty new itemSet array. 2) Place into this array attribute/column numbers that correspond to the appropriate equivalents contained in the conversion array. 3) Reorder this itemSet and return into the data array. */ public void recastInputData() { short[] itemSet; int attribute; // Step through data array using loop construct for(int rowIndex=0;rowIndex Proceed as follows: 1) For each record in the data array. Create an empty new itemSet array. 2) Place into this array any column numbers in record that are supported at the index contained in the conversion array. 3) Assign new itemSet back into to data array */ public void recastInputDataAndPruneUnsupportedAtts() { short[] itemSet; int attribute; // Step through data array using loop construct for(int rowIndex=0;rowIndex= minSupport) { itemSet = reallocInsert(itemSet, (short) conversionArray[attribute][0]); } } // Return new item set to data array dataArray[rowIndex] = itemSet; } } // Set isPrunedFlag (used with GUI interface) isPrunedFlag=true; // Reset number of one item sets field numOneItemSets = getNumSupOneItemSets(); } /* GET NUM OF SUPPORTE ONE ITEM SETS */ /** Gets number of supported single item sets (note this is not necessarily the same as the number of columns/attributes in the input set). @return Number of supported 1-item sets */ protected int getNumSupOneItemSets() { int counter = 0; // Step through conversion array incrementing counter for each // supported element found for (int index=1;index < conversionArray.length;index++) { if (conversionArray[index][1] >= minSupport) counter++; } // Return return(counter); } /* RESIZE INPUT DATA */ /** Recasts the input data sets so that only N percent is used. @param percentage the percentage of the current input data that is to form the new input data set (number between 0 and 100). */ public void resizeInputData(double percentage) { // Redefine number of rows numRows = (int) ((double) numRows*(percentage/100.0)); System.out.println("Recast input data, new num rows = " + numRows); // Dimension and populate training set. short[][] trainingSet = new short[numRows][]; for (int index=0;index The support field is not used. */ /* INSERT (ASSOCIATION/CLASSIFICATION) RULE INTO RULE LINKED LIST (ORDERED ACCORDING CONFIDENCE). */ /** Inserts an (association/classification) rule into the linkedlist of rules pointed at by startRulelist.

The list is ordered so that rules with highest confidence are listed first. If two rules have the same confidence the new rule will be placed after the existing rule. Thus, if using an Apriori approach to generating rules, more general rules will appear first in the list with more specific rules (i.e. rules with a larger antecedent) appearing later as the more general rules will be generated first. @param antecedent the antecedent (LHS) of the rule. @param consequent the consequent (RHS) of the rule. @param confidenceForRule the associated confidence value. */ protected void insertRuleintoRulelist(short[] antecedent, short[] consequent, double confidenceForRule) { // Create new node RuleNode newNode = new RuleNode(antecedent,consequent, confidenceForRule); // Empty list situation if (startRulelist == null) { startRulelist = newNode; return; } // Add new node to start if (confidenceForRule > startRulelist.confidenceForRule) { newNode.next = startRulelist; startRulelist = newNode; return; } // Add new node to middle RuleNode markerNode = startRulelist; RuleNode linkRuleNode = startRulelist.next; while (linkRuleNode != null) { if (confidenceForRule > linkRuleNode.confidenceForRule) { markerNode.next = newNode; newNode.next = linkRuleNode; return; } markerNode = linkRuleNode; linkRuleNode = linkRuleNode.next; } // Add new node to end markerNode.next = newNode; } /* ----------------------------------------------- */ /* */ /* ITEM SET INSERT AND ADD METHODS */ /* */ /* ----------------------------------------------- */ /* REALLOC INSERT */ /** Resizes given item set so that its length is increased by one and new element inserted. @param oldItemSet the original item set @param newElement the new element/attribute to be inserted @return the combined item set */ protected short[] reallocInsert(short[] oldItemSet, short newElement) { // No old item set if (oldItemSet == null) { short[] newItemSet = {newElement}; return(newItemSet); } // Otherwise create new item set with length one greater than old // item set int oldItemSetLength = oldItemSet.length; short[] newItemSet = new short[oldItemSetLength+1]; // Loop int index1; for (index1=0;index1 < oldItemSetLength;index1++) { if (newElement < oldItemSet[index1]) { newItemSet[index1] = newElement; // Add rest for(int index2 = index1+1;index2itemSet1 in itemSet2. */ protected short[] complement(short[] itemSet1, short[] itemSet2) { int lengthOfComp = itemSet2.length-itemSet1.length; // Return null if no complement if (lengthOfComp<1) return(null); // Otherwsise define combination array and determine complement short[] complement = new short[lengthOfComp]; int complementIndex = 0; for(int index=0;indexcombinations method to calculate all possible combinations of a given item set.

For example given the item set [1,2,3] this will result in the combinations[[1],[2],[3],[1,2],[1,3],[2,3],[1,2,3]]. @param inputSet the given item set. @return array of arrays representing all possible combinations (may be null if no combinations). */ protected short[][] combinations(short[] inputSet) { if (inputSet == null) return(null); else { short[][] outputSet = new short[getCombinations(inputSet)][]; combinations(inputSet,0,null,outputSet,0); return(outputSet); } } /** Recursively calculates all possible combinations of a given item set. @param inputSet the given item set. @param inputIndex the index within the input set marking current element under consideration (0 at start). @param sofar the part of a combination determined sofar during the recursion (null at start). @param outputSet the combinations collected so far, will hold all combinations when recursion ends. @param outputIndex the current location in the output set. @return revised output index. */ private int combinations(short[] inputSet, int inputIndex, short[] sofar, short[][] outputSet, int outputIndex) { short[] tempSet; int index=inputIndex; // Loop through input array while(index < inputSet.length) { tempSet = realloc1(sofar,inputSet[index]); outputSet[outputIndex] = tempSet; outputIndex = combinations(inputSet,index+1, copyItemSet(tempSet),outputSet,outputIndex+1); index++; } // Return return(outputIndex); } /* GET COMBINATTIONS */ /** Gets the number of possible combinations of a given item set. @param set the given item set. @return number of possible combinations. */ private int getCombinations(short[] set) { int counter=0, numComb; numComb = (int) Math.pow(2.0,set.length)-1; // Return return(numComb); } /* ---------------------------------------------------------------- */ /* */ /* MISCELANEOUS */ /* */ /* ---------------------------------------------------------------- */ /* COPY ITEM SET */ /** Makes a copy of a given itemSet. @param itemSet the given item set. @return copy of given item set. */ protected short[] copyItemSet(short[] itemSet) { // Check whether there is a itemSet to copy if (itemSet == null) return(null); // Do copy and return short[] newItemSet = new short[itemSet.length]; for(int index=0;index Used for diagnostic purposes. @param dataSet the five array of arrays. */ protected void outputDataArray(short[][] dataSet) { if (dataSet==null) { System.out.println("null"); return; } // Loop through data array for(int index=0;index "); outputItemSet(rule.consequent); } /* OUTPUT RULE LINKED LIST WITH DEFAULT */ /** Outputs contents of rule linked list (if any), with reconversion, such that last rule is the default rule. */ public void outputRulesWithDefault() { int number = 1; RuleNode linkRuleNode = startRulelist; while (linkRuleNode != null) { // Output rule number System.out.print("(" + number + ") "); // Output antecedent if (linkRuleNode.next==null) System.out.print("Default -> "); else { outputItemSet(linkRuleNode.antecedent); System.out.print(" -> "); } // Output concequent outputItemSet(linkRuleNode.consequent); System.out.println(" " + twoDecPlaces(linkRuleNode.confidenceForRule) + "%"); // Increment parameters number++; linkRuleNode = linkRuleNode.next; } } /* --------------------------------- */ /* */ /* DIAGNOSTIC OUTPUT */ /* */ /* --------------------------------- */ /* OUTPUT DURATION */ /** Outputs difference between two given times. @param time1 the first time. @param time2 the second time. @return duration. */ public double outputDuration(double time1, double time2) { double duration = (time2-time1)/1000; System.out.println("Generation time = " + twoDecPlaces(duration) + " seconds (" + twoDecPlaces(duration/60) + " mins)"); // Return return(duration); } /* -------------------------------- */ /* */ /* OUTPUT UTILITIES */ /* */ /* -------------------------------- */ /* TWO DECIMAL PLACES */ /** Converts given real number to real number rounded up to two decimal places. @param number the given number. @return the number to two decimal places. */ protected double twoDecPlaces(double number) { int numInt = (int) ((number+0.005)*100.0); number = ((double) numInt)/100.0; return(number); } }