/* -------------------------------------------------------------------------- */ /* */ /* ASSOCIATION RULE DATA MINING */ /* */ /* Frans Coenen */ /* */ /* Wednesday 9 January 2003 */ /* (revised 21/1/2003, 14/2/2003, 2/5/2003, 2/7/2003, 3/2/2004, 8/5/2004, */ /* 1/2/2005, 3/2/2005) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* To compile: javac.exe AssocRuleMining.java */ // Java packages import java.io.*; import java.util.*; // Java GUI packages import javax.swing.*; /** Set of utillities to support various Association Rule Mining (ARM) algorithms. @author Frans Coenen @version 1 February 2005 */ public class AssocRuleMining extends JFrame { /* ------ FIELDS ------ */ // Inner class for storing linked list of ARs or CARs as appropriate. protected class RuleNode { /** Antecedent of AR. */ protected short[] antecedent; /** Consequent of AR. */ protected short[] consequent; /** The confidence value associate with the rule represented by this node. */ double confidenceForRule=0.0; /** Link to next node */ RuleNode next = null; /** Three argument constructor @param ante the antecedent (LHS) of the AR. @param cons the consequent (RHS) of the AR. @param confValue the associated confidence value. */ protected RuleNode(short[] ante, short[]cons, double confValue) { antecedent = ante; consequent = cons; confidenceForRule = confValue; } } // Data structures /** The reference to start of the rule list. */ protected RuleNode startRulelist = null; /** 2-D aray to hold input data from data file. Note that within the data array records are numbered from zero, thus rexcord one has index 0 etc. */ protected short[][] dataArray = null; /** 2-D array used to renumber columns for input data in terms of frequency of single attributes (reordering will enhance performance for some ARM algorithms). */ protected int[][] conversionArray = null; /** 1-D array used to reconvert input data column numbers to their original numbering where the input data has been ordered to enhance computational efficiency. */ protected short[] reconversionArray = null; // Constants /** Minimum support value */ private static final double MIN_SUPPORT = 0.0; /** Maximum support value */ private static final double MAX_SUPPORT = 100.0; /** Maximum confidence value */ private static final double MIN_CONFIDENCE = 0.0; /** Maximum confidence value */ private static final double MAX_CONFIDENCE = 100.0; // Command line arguments with default values and associated fields. /** Command line argument for data file name. */ protected String fileName = null; /** Command line argument for number of columns. */ protected int numCols = 0; /** Command line argument for number of rows. */ protected int numRows = 0; /** Command line argument for % support (default = 20%). */ protected double support = 20.0; /** Minimum support value in terms of number of rows.
Set when input
data is read and the number of records is known, */
protected double minSupport = 0;
/** Command line argument for % confidence (default = 80%). */
protected double confidence = 80.0;
/** The number of one itemsets (singletons). */
protected int numOneItemSets = 0;
// Flags
/** Error flag used when checking command line arguments (default =
true). */
protected boolean errorFlag = true;
/** Input format OK flag( default = true). */
protected boolean inputFormatOkFlag = true;
/** Flag to indicate whether system has data or not. */
private boolean haveDataFlag = false;
/** Flag to indicate whether input data has been sorted or not. */
protected boolean isOrderedFlag = false;
/** Flag to indicate whether input data has been sorted and pruned or
not. */
protected boolean isPrunedFlag = false;
// Other fields
/** The input stream. */
protected BufferedReader fileInput;
/** The file path */
protected File filePath = null;
/* ------ CONSTRUCTORS ------ */
/** Processes command line arguments */
public AssocRuleMining(String[] args) {
// Process command line arguments
for(int index=0;index Note that it is assumed
that no empty records are included. Proceeds as follows:
Example, given the data set:
Used when ordering classification input
data.
@param countArray The 2-D array returned by the countSingles
method.
@param endIndex the index of the Nth element. */
protected void orderFirstNofCountArray(int[][] countArray, int endIndex) {
int attribute, quantity;
boolean isOrdered;
int index;
do {
isOrdered = true;
index = 1;
while (index < endIndex) {
if (countArray[index][1] >= countArray[index+1][1]) index++;
else {
isOrdered=false;
// Swap
attribute = countArray[index][0];
quantity = countArray[index][1];
countArray[index][0] = countArray[index+1][0];
countArray[index][1] = countArray[index+1][1];
countArray[index+1][0] = attribute;
countArray[index+1][1] = quantity;
// Increment index
index++;
}
}
} while (isOrdered==false);
}
/* DEFINE CONVERSION ARRAYS: */
/** Defines conversion and reconversion arrays.
@param countArray The 2-D array sorted by the orderCcountArray
method.*/
protected void defConvertArrays(int[][] countArray) {
// Dimension arrays
conversionArray = new int[numCols+1][2];
reconversionArray = new short[numCols+1];
// Assign values
for(int index=1;index The list is ordered so that
rules with highest confidence are listed first. If two rules have the same
confidence the new rule will be placed after the existing rule. Thus, if
using an Apriori approach to generating rules, more general rules will
appear first in the list with more specific rules (i.e. rules with a larger
antecedent) appearing later as the more general rules will be generated
first.
@param antecedent the antecedent (LHS) of the rule.
@param consequent the consequent (RHS) of the rule.
@param confidenceForRule the associated confidence value. */
protected void insertRuleintoRulelist(short[] antecedent,
short[] consequent, double confidenceForRule) {
// Create new node
RuleNode newNode = new RuleNode(antecedent,consequent,
confidenceForRule);
// Empty list situation
if (startRulelist == null) {
startRulelist = newNode;
return;
}
// Add new node to start
if (confidenceForRule > startRulelist.confidenceForRule) {
newNode.next = startRulelist;
startRulelist = newNode;
return;
}
// Add new node to middle
RuleNode markerNode = startRulelist;
RuleNode linkRuleNode = startRulelist.next;
while (linkRuleNode != null) {
if (confidenceForRule > linkRuleNode.confidenceForRule) {
markerNode.next = newNode;
newNode.next = linkRuleNode;
return;
}
markerNode = linkRuleNode;
linkRuleNode = linkRuleNode.next;
}
// Add new node to end
markerNode.next = newNode;
}
/* ----------------------------------------------- */
/* */
/* ITEM SET INSERT AND ADD METHODS */
/* */
/* ----------------------------------------------- */
/* REALLOC INSERT */
/** Resizes given item set so that its length is increased by one
and new element inserted.
@param oldItemSet the original item set
@param newElement the new element/attribute to be inserted
@return the combined item set */
protected short[] reallocInsert(short[] oldItemSet, short newElement) {
// No old item set
if (oldItemSet == null) {
short[] newItemSet = {newElement};
return(newItemSet);
}
// Otherwise create new item set with length one greater than old
// item set
int oldItemSetLength = oldItemSet.length;
short[] newItemSet = new short[oldItemSetLength+1];
// Loop
int index1;
for (index1=0;index1 < oldItemSetLength;index1++) {
if (newElement < oldItemSet[index1]) {
newItemSet[index1] = newElement;
// Add rest
for(int index2 = index1+1;index2
For example given the item set [1,2,3] this will result in the
combinations[[1],[2],[3],[1,2],[1,3],[2,3],[1,2,3]].
@param inputSet the given item set.
@return array of arrays representing all possible combinations (may be null
if no combinations). */
protected short[][] combinations(short[] inputSet) {
if (inputSet == null) return(null);
else {
short[][] outputSet = new short[getCombinations(inputSet)][];
combinations(inputSet,0,null,outputSet,0);
return(outputSet);
}
}
/** Recursively calculates all possible combinations of a given item
set.
@param inputSet the given item set.
@param inputIndex the index within the input set marking current
element under consideration (0 at start).
@param sofar the part of a combination determined sofar during the
recursion (null at start).
@param outputSet the combinations collected so far, will hold all
combinations when recursion ends.
@param outputIndex the current location in the output set.
@return revised output index. */
private int combinations(short[] inputSet, int inputIndex,
short[] sofar, short[][] outputSet, int outputIndex) {
short[] tempSet;
int index=inputIndex;
// Loop through input array
while(index < inputSet.length) {
tempSet = realloc1(sofar,inputSet[index]);
outputSet[outputIndex] = tempSet;
outputIndex = combinations(inputSet,index+1,
copyItemSet(tempSet),outputSet,outputIndex+1);
index++;
}
// Return
return(outputIndex);
}
/* GET COMBINATTIONS */
/** Gets the number of possible combinations of a given item set.
@param set the given item set.
@return number of possible combinations. */
private int getCombinations(short[] set) {
int counter=0, numComb;
numComb = (int) Math.pow(2.0,set.length)-1;
// Return
return(numComb);
}
/* ---------------------------------------------------------------- */
/* */
/* MISCELANEOUS */
/* */
/* ---------------------------------------------------------------- */
/* COPY ITEM SET */
/** Makes a copy of a given itemSet.
@param itemSet the given item set.
@return copy of given item set. */
protected short[] copyItemSet(short[] itemSet) {
// Check whether there is a itemSet to copy
if (itemSet == null) return(null);
// Do copy and return
short[] newItemSet = new short[itemSet.length];
for(int index=0;index
*/
protected void readFile() {
try {
// Dimension data structure
inputFormatOkFlag=true;
numRows = getNumberOfLines(fileName);
if (inputFormatOkFlag) {
dataArray = new short[numRows][];
// Read file
System.out.println("Reading input file: " + fileName);
readInputDataSet();
}
else System.out.println("Error reading file: " + fileName + "\n");
}
catch(IOException ioException) {
System.out.println("Error reading File");
closeFile();
System.exit(1);
}
}
/* GET NUMBER OF LINES */
/** Gets number of lines/records in input file and checks format of each
line.
@param nameOfFile the filename of the file to be opened.
@return the number of rows in the given file. */
protected int getNumberOfLines(String nameOfFile) throws IOException {
int counter = 0;
// Open the file
if (filePath==null) openFileName(nameOfFile);
else openFilePath();
// Loop through file incrementing counter
// get first row.
String line = fileInput.readLine();
while (line != null) {
checkLine(counter+1,line);
StringTokenizer dataLine = new StringTokenizer(line);
int numberOfTokens = dataLine.countTokens();
if (numberOfTokens == 0) break;
counter++;
line = fileInput.readLine();
}
// Close file and return
closeFile();
return(counter);
}
/* CHECK LINE */
/** Check whether given line from input file is of appropriate format
(space separated integers), if incorrectly formatted line found
inputFormatOkFlag set to false.
@param counter the line number in the input file.
@param str the current line from the input file. */
protected void checkLine(int counter, String str) {
for (int index=0;index
1 2 5
1 2 3
2 4 5
1 2 5
2 3 5
This would produce a countArray (ignore index 0):
+---+---+---+---+---+---+
| | 1 | 2 | 3 | 4 | 5 |
+---+---+---+---+---+---+
| | 3 | 5 | 2 | 1 | 4 |
+---+---+---+---+---+---+
Which sorts to:
+---+---+---+---+---+---+
| | 2 | 5 | 1 | 3 | 4 |
+---+---+---+---+---+---+
| | 5 | 4 | 3 | 2 | 1 |
+---+---+---+---+---+---+
Giving rise to the conversion Array of the form (no index 0):
+---+---+---+---+---+---+
| | 3 | 1 | 4 | 5 | 2 |
+---+---+---+---+---+---+
| | 3 | 5 | 2 | 1 | 4 |
+---+---+---+---+---+---+
Note that the second row here are the counts which no longer play a role
in the conversion exercise. Thus to the new column number for column 1 is
column 3 (i.e. the first vale at index 1). The reconversion array of the
form:
+---+---+---+---+---+---+
| | 2 | 5 | 1 | 3 | 4 |
+---+---+---+---+---+---+
*/
public void idInputDataOrdering() {
// Count singles and store in countArray;
int[][] countArray = countSingles();
// Bubble sort count array on support value (second index)
orderCountArray(countArray);
// Define conversion and reconversion arrays
defConvertArrays(countArray);
// Set sorted flag
isOrderedFlag = true;
}
/* COUNT SINGLES */
/** Counts number of occurrences of each single attribute in the
input data.
@return 2-D array where first row represents column numbers
and second row represents support counts. */
protected int[][] countSingles() {
// Dimension and initialize count array
int[][] countArray = new int[numCols+1][2];
for (int index=0;index