/* -------------------------------------------------------------------------- */ /* */ /* ASSOCIATION RULE DATA MINING */ /* */ /* Frans Coenen */ /* */ /* Wednesday 9 January 2003 */ /* (revised 21/1/2003, 14/2/2003, 2/5/2003, 2/7/2003, 3/2/2004, 8/5/2004, */ /* 1/2/2005, 3/2/2005, 14/2/06, 14/3/06, 18/6/06, 1/7/2006, 1/6/2009) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ //package lucsKDD_ARM; /* To compile: javaARMpackc.exe AssocRuleMining.java */ // Java packages import java.io.*; import java.util.*; // Java GUI packages import javax.swing.*; /** Set of utillities to support various Association Rule Mining (ARM) algorithms included in the LUCS-KDD suite of ARM programs. @author Frans Coenen @version 1 June 2008 */ public class AssocRuleMining extends JFrame { /* ------ FIELDS ------ */ // Data structures /** The reference to start of the rule list. */ protected RuleNode startRulelist = null; /** 2-D aray to hold input data from data file. Note that within the data array records are numbered from zero, thus record one has index 0 etc.
First index is row (record or TID) number starting from 0, and second is attribute (column) number starting from zero. */ protected short[][] dataArray = null; /** 2-D array used to renumber columns for input data in terms of frequency of single attributes (reordering will enhance performance for some ARM algorithms). */ protected int[][] conversionArray = null; /** 1-D array used to reconvert input data column numbers to their original numbering where the input data has been ordered to enhance computational efficiency. */ protected short[] reconversionArray = null; /** 1-D array to hold output schema. */ private String[] outputSchema = null; // Constants /** Minimum support value */ protected static final double MIN_SUPPORT = 0.0; /** Maximum support value */ protected static final double MAX_SUPPORT = 100.0; /** Maximum confidence value */ protected static final double MIN_CONFIDENCE = 0.0; /** Maximum confidence value */ protected static final double MAX_CONFIDENCE = 100.0; // Command line arguments with default values and associated fields. /** Command line argument for data file name. */ protected String fileName = null; /** File name for testset. */ protected String testSetFileName = null; /** Command line argument for number of columns (attributes) in input data. */ protected int numCols = 0; /** Command line argument for number of rows in input data. */ protected int numRows = 0; /** Command line argument for % support (default = 20%). */ protected double support = 20.0; /** Minimum support value in terms of number of rows.
Set when input
data is read and the number of records is known, reset if input data is
resized so that only N percent is used. */
protected double minSupport = 0;
/** Command line argument for % confidence (default = 80%). */
protected double confidence = 80.0;
/** The number of one itemsets (singletons). */
protected int numOneItemSets = 0;
/** Number of classes in input data set (input by the user). */
protected int numClasses = 0;
/** Number of rows in output schema. */
private int numRowsInOutputSchema = 0;
// Flags
/** Error flag used when checking command line arguments (default =
true). */
protected boolean errorFlag = true;
/** Input format OK flag( default = true). */
protected boolean inputFormatOkFlag = true;
/** Flag to indicate whether system has data or not. */
private boolean haveDataFlag = false;
/** Flag to indicate whether input data has been sorted or not. */
protected boolean isOrderedFlag = false;
/** Flag to indicate whether input data has been sorted and pruned or
not. */
protected boolean isPrunedFlag = false;
/** Flag to indicate whether output schema is available or not. */
protected boolean hasOutputSchemaFlag = false;
/** Support confidence framework flag. */
protected boolean supConfFworkFlag = false;
/** Support lift framework flag. */
protected boolean supLiftFworkFlag = false;
// Other fields
/** The input stream. */
protected BufferedReader fileInput;
/** The file path */
protected File filePath = null;
/* ------ CONSTRUCTORS ------ */
/** Constructor with command line arguments to be process.
@param args the command line arguments (array of String instances). */
public AssocRuleMining(String[] args) {
// Process command line arguments
for(int index=0;index Note that it is assumed
that no empty records are included. Proceeds as follows:
Similar to getNmberOfLines method above but without line checking.
@param fName the name of the output schema file.
@return the number of lines (attributes) in the output schema. */
protected int getNumLinesInOutputSchema(String fName) throws IOException {
int rowIndex=0;
// Open the file
if (filePath==null) openFileName(fName);
else openFilePath();
// Get first row.
String line = fileInput.readLine();
// Process rest of file
while (line != null) {
// Increment row index in output schema array
rowIndex++;
// get next line
line = fileInput.readLine();
}
// Close file and returm
closeFile();
return(rowIndex);
}
/* READ OUTPUT SCHEMA */
/** Reads output schema from file. */
public void readOutputSchema() throws IOException {
readOutputSchema(fileName);
}
/* READ OUTPUT SCEMA */
/** Reads outpur schema from given file.
@param fName the given file name. */
protected void readOutputSchema(String fName) throws IOException {
int rowIndex=0;
// Open the file
if (filePath==null) openFileName(fName);
else openFilePath();
// Get first row.
String line = fileInput.readLine();
// Process rest of file
while (line != null) {
outputSchema[rowIndex] = line;
// Increment row index in output schema array
rowIndex++;
// get next line
line = fileInput.readLine();
}
// Close file
closeFile();
}
/** Check if number of attributes in output schema are same as number of
attributes in input file. If either the output schema or the input
file has nor been loaded then method will return false.
@return true if number of attributes are the same and false otherwise. */
public boolean checkSchemaVdata() {
boolean schemaAndDataAttsSame = true;
// Check schema
if (outputSchema==null) {
JOptionPane.showMessageDialog(null,"CHECK SCHEMA v DATA " +
"ATTRIBUTES ERROR:\nNo output schema file.\n");
return(!schemaAndDataAttsSame);
}
// Check data array
if (dataArray==null) {
JOptionPane.showMessageDialog(null,"CHECK SCHEMA v DATA " +
"ATTRIBUTES ERROR:\nNo input data file.\n");
return(!schemaAndDataAttsSame);
}
// Check lengths.
if (outputSchema.length==numCols) return(schemaAndDataAttsSame);
else {
JOptionPane.showMessageDialog(null,"CHECK SCHEMA v DATA " +
"ATTRIBUTES ERROR:\nNumber of attributes in schema " +
"file (" + outputSchema.length + ") not same as\n" +
"number of attributes in data file (" +
numCols + ")\n");
return(!schemaAndDataAttsSame);
}
}
/* ---------------------------------------------------------------- */
/* */
/* READ INPUT DATA FROM FILE (GUI VERSIONS) */
/* */
/* ---------------------------------------------------------------- */
/* INPUT DATA SET */
/** Commences process of getting input data.
@param textArea the text area in the GUI used for output.
@param fName the name of the input file to be read. */
public void inputDataSet(JTextArea textArea, File fName) {
// Set filePath instance field
filePath = fName;
// Read the file
readFile(textArea);
// Check ordering (only if input format is OK)
if (inputFormatOkFlag) {
if (checkOrdering()) {
// Output to text area
textArea.append("Number of records = " + numRows + "\n");
countNumCols();
textArea.append("Number of columns = " + numCols + "\n");
// Set have data flag to true
haveDataFlag = true;
}
else {
// Set have data flag to false
haveDataFlag = false;
// Set inputFormatOkFlag to true by default for next input
// file
inputFormatOkFlag = true;
textArea.append("Error reading file: " + filePath + "\n\n");
}
}
}
/* READ FILE */
/** Reads input data from file specified in command line argument.
Proceeds as follows:
Example, given the data set:
Used when ordering classification input
data.
@param countArray The 2-D array returned by the countSingles
method.
@param endIndex the index of the Nth element. */
protected void orderFirstNofCountArray(int[][] countArray, int endIndex) {
int attribute, quantity;
boolean isOrdered;
int index;
do {
isOrdered = true;
index = 1;
while (index < endIndex) {
if (countArray[index][1] >= countArray[index+1][1]) index++;
else {
isOrdered=false;
// Swap
attribute = countArray[index][0];
quantity = countArray[index][1];
countArray[index][0] = countArray[index+1][0];
countArray[index][1] = countArray[index+1][1];
countArray[index+1][0] = attribute;
countArray[index+1][1] = quantity;
// Increment index
index++;
}
}
} while (isOrdered==false);
}
/* DEFINE CONVERSION ARRAYS: */
/** Defines conversion and reconversion arrays.
@param countArray The 2-D array sorted by the orderCcountArray
method.*/
protected void defConvertArrays(int[][] countArray) {
// Dimension arrays
conversionArray = new int[numCols+1][2];
reconversionArray = new short[numCols+1];
// Assign values by processing the count array which has now been
// ordered.
for(int index=1;index The list is ordered so that
rules with highest "ordering value" (this is assumed to be aconfidence
value but could equaly well be some other value such as lift) are listed
first. If two rules have the same ordering value the new rule will be
placed after the existing rule. Thus, if using an Apriori approach to
generating rules, more general rules will appear first in the list with
more specific rules (i.e. rules with a larger antecedent) appearing later
as the more general rules will be generated first.
@param antecedent the antecedent (LHS) of the rule.
@param consequent the consequent (RHS) of the rule.
@param orderingValue the associated support value. */
protected void insertRuleintoRulelist(short[] antecedent,
short[] consequent, double orderingValue) {
// Create new node
RuleNode newNode = new RuleNode(antecedent,consequent,orderingValue);
// Empty list situation
if (startRulelist == null) {
startRulelist = newNode;
return;
}
// Add new node to start
if (orderingValue > startRulelist.confidenceForRule) {
newNode.next = startRulelist;
startRulelist = newNode;
return;
}
// Add new node to middle
RuleNode markerNode = startRulelist;
RuleNode linkRuleNode = startRulelist.next;
while (linkRuleNode != null) {
if (orderingValue > linkRuleNode.confidenceForRule) {
markerNode.next = newNode;
newNode.next = linkRuleNode;
return;
}
markerNode = linkRuleNode;
linkRuleNode = linkRuleNode.next;
}
// Add new node to end
markerNode.next = newNode;
}
/* -------------------------------------------------------------- */
/* */
/* RULE LINKED LIST ORDERED ACCORDING TO SIZE OF ANTECEDENT */
/* */
/* -------------------------------------------------------------- */
/* INSERT (ASSOCIATION/CLASSIFICATION) RULE INTO RULE LINKED LIST 2
(ORDERED ACCORDING SIZE OF ANTECEDENT --- MORE SPECIFIC RULES FIRST). */
/** Inserts an (association/classification) rule into the linked list of
rules pointed at by startRulelist. List is ordered so that
more specific rules (i.e. rules with most attributes in their antecedent)
are listed first. **** NOT CURRENTLY USED ****
@param antecedent the antecedent (LHS) of the rule.
@param consequent the consequent (RHS) of the rule.
@param confidenceForRule the associated confidence value. */
protected void insertRuleintoRulelist2(short[] antecedent,
short[] consequent, double confidenceForRule) {
// Create new node
RuleNode newNode = new RuleNode(antecedent,consequent,
confidenceForRule);
// Empty list situation
if (startRulelist == null) {
startRulelist = newNode;
return;
}
// Add new node to start
if (antecedent.length > startRulelist.antecedent.length) {
newNode.next = startRulelist;
startRulelist = newNode;
return;
}
// Add new node to middle
RuleNode markerNode = startRulelist;
RuleNode linkRuleNode = startRulelist.next;
while (linkRuleNode != null) {
if (antecedent.length > linkRuleNode.antecedent.length) {
markerNode.next = newNode;
newNode.next = linkRuleNode;
return;
}
markerNode = linkRuleNode;
linkRuleNode = linkRuleNode.next;
}
// Add new node to end
markerNode.next = newNode;
}
/* ----------------------------------------------- */
/* */
/* ITEM SET INSERT AND ADD METHODS */
/* */
/* ----------------------------------------------- */
/* APPEND */
/** Concatenates two itemSets --- resizes given array so that its
length is increased by size of second array and second array added.
@param itemSet1 The first item set.
@param itemSet2 The item set to be appended.
@return the combined item set */
protected short[] append(short[] itemSet1, short[] itemSet2) {
// Test for empty sets, if found return other
if (itemSet1 == null) return(copyItemSet(itemSet2));
else if (itemSet2 == null) return(copyItemSet(itemSet1));
// Create new array
short[] newItemSet = new short[itemSet1.length+itemSet2.length];
// Loop through itemSet 1
int index1;
for(index1=0;index1 Note that
given itemSets may not be disjoint.
@param itemSet1 The first given item set.
@param itemSet2 the se4cond given item set.
@return the union of the two itemSets. */
protected short[] union(short[] itemSet1, short[] itemSet2) {
// check for null sets
if (itemSet1 == null) {
if (itemSet2 == null) return(null);
else return(itemSet2);
}
if (itemSet2 == null) return(itemSet1);
// determine size of union and dimension return itemSet
short[] newItemSet = new short[sizeOfUnion(itemSet1,itemSet2)];
// Loop through itemSets
int index1=0, index2=0, index3=0;
while (index1
For example given the item set [1,2,3] this will result in the
combinations[[1],[2],[3],[1,2],[1,3],[2,3],[1,2,3]].
@param inputSet the given item set.
@return array of arrays representing all possible combinations (may be null
if no combinations). */
protected short[][] combinations(short[] inputSet) {
if (inputSet == null) return(null);
else {
short[][] outputSet = new short[getCombinations(inputSet)][];
combinations(inputSet,0,null,outputSet,0);
return(outputSet);
}
}
/** Recursively calculates all possible combinations of a given item
set.
@param inputSet the given item set.
@param inputIndex the index within the input set marking current
element under consideration (0 at start).
@param sofar the part of a combination determined sofar during the
recursion (null at start).
@param outputSet the combinations collected so far, will hold all
combinations when recursion ends.
@param outputIndex the current location in the output set.
@return revised output index. */
private int combinations(short[] inputSet, int inputIndex,
short[] sofar, short[][] outputSet, int outputIndex) {
short[] tempSet;
int index=inputIndex;
// Loop through input array
while(index < inputSet.length) {
tempSet = realloc1(sofar,inputSet[index]);
outputSet[outputIndex] = tempSet;
outputIndex = combinations(inputSet,index+1,
copyItemSet(tempSet),outputSet,outputIndex+1);
index++;
}
// Return
return(outputIndex);
}
/* GET COMBINATTIONS */
/** Gets the number of possible combinations of a given item set.
@param set the given item set.
@return number of possible combinations. */
private int getCombinations(short[] set) {
int counter=0, numComb;
numComb = (int) Math.pow(2.0,set.length)-1;
// Return
return(numComb);
}
/* ---------------------------------------------------------------- */
/* */
/* MISCELANEOUS */
/* */
/* ---------------------------------------------------------------- */
/* COPY DATA ARRAY */
/** Makes a copy of the input data set.
@param itemSet the given item set.
@return copy of given item set. */
protected short[][] copyDataArray() {
return(copyItemSet(dataArray));
}
/* COPY ITEM SET */
/** Makes a copy of a given itemSet.
@param itemSet the given item set.
@return copy of given item set. */
protected short[] copyItemSet(short[] itemSet) {
// Check whether there is a itemSet to copy
if (itemSet == null) return(null);
// Do copy and return
short[] newItemSet = new short[itemSet.length];
for(int index=0;index Used where input has been
reordered so that most frequent attributes are listed first in which case,
without reconversion, the orinal attribute N will probably not be the new
attribute N. */
public void outputRulesWithReconversion() {
System.out.println("ERROR: outputRulesWithReconversion depricated" +
"use outputRules instead");
}
/* OUTPUT RULE LINKED LIST WITH DEFAULT */
/** Outputs contents of rule linked list (if any), asuming that the list
represents a set of CRs, such that last rule is the default rule. */
public void outputRulesWithDefault() {
// Check for emty rule list
if (startRulelist==null) {
System.out.println("NO RULES GENERATED");
return;
}
// Process list
int number = 1;
RuleNode linkRuleNode = startRulelist;
while (linkRuleNode != null) {
// Output rule number
System.out.print("(" + number + ") ");
// Output antecedent
if (linkRuleNode.next==null) System.out.print("Default -> ");
else {
outputItemSet(linkRuleNode.antecedent);
System.out.print(" -> ");
}
// Output concequent
outputItemSet(linkRuleNode.consequent);
System.out.println(" " +
twoDecPlaces(linkRuleNode.confidenceForRule));
// Increment parameters
number++;
linkRuleNode = linkRuleNode.next;
}
}
/* OUTPUT RULE LINKED LIST WITH DEFAULT (FIRST N) */
/** Outputs first N tule in rule linked list (if any), asuming that the
list represents a set of CRs, such that last rule is the default rule.
@param numberOfRulesToBeOutput the maximum number of rules to output. */
public void outputRulesWithDefault(int numberOfRulesToBeOutput) {
// Check for emty rule list
if (startRulelist==null) {
System.out.println("NO RULES GENERATED");
return;
}
// Process list
int number = 1;
RuleNode linkRuleNode = startRulelist;
while (numberOfRulesToBeOutput>=0) {
if (linkRuleNode== null) break;
// Output rule number
System.out.print("(" + number + ") ");
// Output antecedent
if (linkRuleNode.next==null) System.out.print("Default -> ");
else {
outputItemSet(linkRuleNode.antecedent);
System.out.print(" -> ");
}
// Output concequent
outputItemSet(linkRuleNode.consequent);
System.out.println(" " +
twoDecPlaces(linkRuleNode.confidenceForRule));
// Increment/decrement parameters
number++;
linkRuleNode = linkRuleNode.next;
numberOfRulesToBeOutput--;
}
}
/* OUTPUT RULE LINKED LIST WITH DEFAULT AND RECONVERSION */
/** Outputs contents of rule linked list (if any), asuming that the list
represents a set of CRs, such that last rule is the default rule with
reconversion. Used where input has been reordered so that most
frequent attributes are listed first in which case, without reconversion,
the orinal attribute N will probably not be the new attribute N. */
public void outputRulesWithDefaultRecon() {
System.out.println("ERROR: outputRulesWithDefaultRecon depricated " +
"use outputRulesWithDefault instead");
}
/* OUTPUT NUMBER OF RULES */
/** Outputs number of generated rules (ARs or CARS). */
public void outputNumRules() {
System.out.println("Number of rules = " + getNumCRs());
}
/* ----------------------------------------------------- */
/* */
/* GUI OUTPUT METHODS */
/* */
/* ----------------------------------------------------- */
/* OUTPUT DATA TABLE */
/** Outputs stored input data set; initially read from input data file, but
may be reordered or pruned if desired by a particular application.
@param textArea the text area. */
public void outputDataArray(JTextArea textArea) {
if (isPrunedFlag) textArea.append("DATA SET (Ordered and Pruned)\n" +
"-----------------------------\n");
else {
if (isOrderedFlag) textArea.append("DATA SET (Ordered)\n" +
"------------------\n");
else textArea.append("DATA SET\n" + "--------\n");
}
for(int index=0;index WARNING will overwrite
existing data if stored in the same directory as the application
exacutable, data files are better stored in a separate "DataFiles"
directory. */
public void outputDataArrayToFile() throws IOException{
//Determin file name
int fileNameIndex = fileName.lastIndexOf('/');
String shortFileName = fileName.substring(fileNameIndex+1,
fileName.length());
// Open file for writing
PrintWriter outputFile = new PrintWriter(new FileWriter(shortFileName));
// Write contents of Data array to file
for (int rowIndex = 0;rowIndex
*/
protected void readFile() {
try {
// Dimension data structure
inputFormatOkFlag=true;
numRows = getNumberOfLines(fileName);
if (inputFormatOkFlag) {
dataArray = new short[numRows][];
// Read file
System.out.println("Reading input file: " + fileName);
readInputDataSet();
}
else System.out.println("Error reading file: " + fileName + "\n");
}
catch(IOException ioException) {
System.out.println("Error reading File");
closeFile();
System.exit(1);
}
}
/* GET NUMBER OF LINES */
/** Gets number of lines/records in input file and checks format of each
line.
@param nameOfFile the filename of the file to be opened.
@return the number of rows in the given file. */
protected int getNumberOfLines(String nameOfFile) throws IOException {
int counter = 0;
// Open the file
if (filePath==null) openFileName(nameOfFile);
else openFilePath();
// Loop through file incrementing counter
// get first row.
String line = fileInput.readLine();
while (line != null) {
checkLine(counter+1,line);
StringTokenizer dataLine = new StringTokenizer(line);
int numberOfTokens = dataLine.countTokens();
if (numberOfTokens == 0) break;
counter++;
line = fileInput.readLine();
}
// Close file and return
closeFile();
return(counter);
}
/* CHECK LINE */
/** Check whether given line from input file is of appropriate format
(space separated integers), if incorrectly formatted line found
inputFormatOkFlag set to false.
@param counter the line number in the input file.
@param str the current line from the input file. */
protected void checkLine(int counter, String str) {
for (int index=0;index
@param textArea the text area in the gui used for output. */
public void readFile(JTextArea textArea) {
try {
// Dimension data structure
inputFormatOkFlag=true;
numRows = getNumberOfLines(fileName);
if (inputFormatOkFlag) {
dataArray = new short[numRows][];
// Read file
textArea.append("Reading input file:\n" + filePath + "\n");
readInputDataSet();
// Set have data flag to true
haveDataFlag = true;
}
else {
// Set have data flag to false
haveDataFlag = false;
textArea.append("Error reading file:\n" + filePath + "\n\n");
}
}
catch(IOException ioException) {
JOptionPane.showMessageDialog(this,"Error reading File",
"Error: ",JOptionPane.ERROR_MESSAGE);
textArea.append("Error reading File\n");
closeFile();
// Set have data flag to false
haveDataFlag = false;
}
}
/* ---------------------------------------------------------------- */
/* */
/* REORDER DATA SET ACCORDING TO ATTRIBUTE FREQUENCY */
/* */
/* ---------------------------------------------------------------- */
/* REORDER INPUT DATA: */
/** Reorders input data according to frequency of single attributes.
1 2 5
1 2 3
2 4 5
1 2 5
2 3 5
This would produce a countArray (ignore index 0 because there is no
attributr number 0):
+---+---+---+---+---+---+
| | 1 | 2 | 3 | 4 | 5 |
+---+---+---+---+---+---+
| | 3 | 5 | 2 | 1 | 4 |
+---+---+---+---+---+---+
Which sorts to:
+---+---+---+---+---+---+
| | 2 | 5 | 1 | 3 | 4 |
+---+---+---+---+---+---+
| | 5 | 4 | 3 | 2 | 1 |
+---+---+---+---+---+---+
Giving rise to the conversion Array of the form (no index 0):
+---+---+---+---+---+---+
| | 3 | 1 | 4 | 5 | 2 |
+---+---+---+---+---+---+
| | 3 | 5 | 2 | 1 | 4 |
+---+---+---+---+---+---+
Note that the first row gives the new attribute number (old attribute
number is the index). The second row here are the counts used to identify
the ordering but which now no longer play a role in the conversion
exercise. Thus the new column (attriburte) number for column/attribute 1 is
column 3 (i.e. the first vale at index 1). The reconversion array will be
of the form (values are the indexes from the conversion array while indexes
represent the first vlaue from the conversion array):
+---+---+---+---+---+---+
| | 2 | 5 | 1 | 3 | 4 |
+---+---+---+---+---+---+
For example to convert the attribute number 3 back to its original number
we look up the value at index 3.
*/
public void idInputDataOrdering() {
// Count singles and store in countArray;
int[][] countArray = countSingles();
// Bubble sort count array on support value (second index)
orderCountArray(countArray);
// Define conversion and reconversion arrays
defConvertArrays(countArray);
// Set sorted flag
isOrderedFlag = true;
}
/* COUNT SINGLES */
/** Counts number of occurrences of each single attribute in the
input data.
@return 2-D array where first row represents column numbers
and second row represents support counts. */
protected int[][] countSingles() {
// Dimension and initialize count array
int[][] countArray = new int[numCols+1][2];
for (int index=0;index