/* -------------------------------------------------------------------------- */ /* */ /* APRIORI-TFP CLASSIFICATION ASSOCIATION RULE (CAR) GENERATION */ /* */ /* Frans Coenen */ /* */ /* Tuesday 27 January 2004 */ /* (Revised 5/2/2004, 12/10/2006) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Class structure AssocRuleMining | +-- TotalSupportTree | +-- PartialSupportTree | +--AprioriTFPclass | +-- AprioriTFP_CARgen */ // Java packages import java.util.*; import java.io.*; // Java GUI packages import javax.swing.*; /** Methods to produce classification rules using a Apriori-TFP appraoch. Assumes that input dataset is orgnised such that classifiers are at the end of each record. CARs differ from ARs in that they have only a single consequent and that the number of admissable consequents is limited. Note that: (i) number of classifiers value is stored in the numClasses field, (ii) note that classifiers are assumed to be listed at the end of the attribute list. @author Frans Coenen @version 12 October 2006 */ public class AprioriTFP_CARgen extends AprioriTFPclass { /* ------ FIELDS ------ */ // CONSTANTS /** The maximum number of CARs */ protected final int MAX_NUM_CARS = 80000; // OTHER FIELDS /** The number of CARs generated so far. */ protected int numCarsSoFar = 0; // HILL CLIMBING FIELDS (ONLY USED BY SPECIFIC APPLICATIONS) // Constants /** Minimum value that support or confidence may be decremented to during hill climbing.
Note we do not want a support of 0.0 as in this case everything will be supported. */ protected static double minimumSupAndConfValue = 0.01; /** Maximum value that support or confidence may be incremented to during hill climbing. */ protected static final double MAXIMUM_SUP_OR_CONF_VALUE = 99.9; /** Ratio of best locations against available locations.
used in hill
climbing process to stop process when a majority of best locations have
been obtained. */
protected static final double BEST_TO_AVAILABLE_RATIO = 2.0/3.0;
// Hill climbing data structure
/** Data structure to hold information concerning a particular location in
the playing area. */
protected class HClocData {
/** Support coordinate */
double suppCoord = 0.0;
/** Confidence coordinate */
double confCoord = 0.0;
/** Associated accuracy */
double accuracy = 0.0;
/** Number of CRs */
int numberOfCRs = 0;
/** Flag indicating if location is outside playing area or not (true
by default) */
boolean inArea = true;
/** Flag indicating if accuracy for location has not been claculated
(true by default) */
boolean notCalculated=true;
/** Default constructor */
protected HClocData() {
}
/** Five argument constructor.
@param supp the Support coordinate
@param conf the Confidence coordinate
@param acc the associated accuracy
@param numCRs the number of clasification rules generated
@param locationFlag the indicator for whether the location is
outside palying area or not.
@param calcFlag the indicator for whether the accuracy for the location
has been claculated ort not. */
protected HClocData(double supp, double conf, double acc, int numCRs,
boolean locationFlag, boolean calcFlag) {
suppCoord = supp;
confCoord = conf;
accuracy = acc;
numberOfCRs = numCRs;
inArea = locationFlag;
notCalculated = calcFlag;
}
}
// Other fields used in hill climbing
/** The minimum support value (%) that can be arrived at through the hill
climbing process. */
protected double minimumDiffSupport = 0.1; // Default
/** The minimum confidence value (%) that can be arrived at through the
hill climbing process. */
protected double minimumDiffConfidence = 1.0; // Default
/** 1-D array to hold locatiopns for gill climbing process. */
protected HClocData[] locData = null;
/* ------ CONSTRUCTORS ------ */
/** Constructor processes command line arguments.
@param args the command line arguments (array of String instances). */
public AprioriTFP_CARgen(String[] args) {
super(args);
}
/** Constructor with argument from existing instance of class
AssocRuleMining.
@param armInstance the given instance of the AssocRuleMining
class. */
public AprioriTFP_CARgen(AssocRuleMining armInstance) {
super(armInstance);
}
/* ------ METHODS ------ */
/*----------------------------------------------------------------------- */
/* */
/* START CLASSIFICATION */
/* */
/*----------------------------------------------------------------------- */
/* START CLASSIFICATION */
/** Starts CAR generation proces using TFPC. */
public void startCARgeneration() {
// Output nature of algorithm
String s = "START CLASSIFICATION (BEST FIRST), TFPC CAR GENERATION " +
"WITH X-CHEKING\n-----------------------" +
"----------------------------------------\n" +
"Max number of CARS = " + MAX_NUM_CARS +
"\nMax size antecedent = " + MAX_SIZE_OF_ANTECEDENT + "\n";
// proceed
startCARgeneration(s);
}
/** Starts CAR generation proces using TFPC (GUI version).
@param tArea the text area to output data to. */
public void startCARgeneration(JTextArea tArea) {
// Set text area
textArea = tArea;
// proceed
startCARgeneration();
}
/** Starts CAR generation proces using TFPC (version with input string
argument).
@param s String to be output to GUI/Command line interface. */
protected void startCARgeneration(String s) {
if (textArea==null) { System.out.print(s); outputLimits(); }
else { textArea.append(s); outputLimits(textArea); }
// Continue CAR generation process
startCARgeneration2();
// Number Rule in BinTree
numberRulesInBinTree();
// Output generated rule set if requested
if (outputRuleSetToFileFlag) outputRulesToFile();
}
/** Continues process of genertaing CARS using apriori TFP. */
protected void startCARgeneration2() {
// Calculate minimum support threshold in terms of number of
// records in the training set.
minSupport = numRowsInTrainingSet*support/100.0;
String s = "Support = " + twoDecPlaces(support) + ", Confidence = " +
twoDecPlaces(confidence) + "\nMinimum support = " +
twoDecPlaces(minSupport) + " (Records)\n";
if (textArea==null) { System.out.print(s); outputLimits(); }
else { textArea.append(s); outputLimits(textArea); }
// Set rule list to null. Note that startRuleList is defined in the
// AssocRuleMining parent class and is also used to store Association
// Rules (ARs) with respect ARM.
startRulelist = null;
numCarsSoFar = 0;
// Create P-tree and then generate T-tree and generate CARs (method
// contained in PartialSupportTree class)
if (textArea==null) {
createPtree();
createTotalSupportTree();
}
else {
createPtree(textArea);
createTotalSupportTree(textArea);
}
}
/*----------------------------------------------------------------------- */
/* */
/* TEN CROSS VALIDATION (TCV) */
/* */
/*----------------------------------------------------------------------- */
/* COMMEMCE TEN CROSS VALIDATION WITH OUTPUT */
/** Start Ten Cross Validation (TCV) process with CAR generation. */
public void commenceTCVwithOutput() {
double[][] parameters = new double[10][4];
System.out.println("START TCV APRIORI-TFP CAR GENERATION\n" +
"------------------------------------");
System.out.println("Max number of CARS = " + MAX_NUM_CARS);
System.out.println("Max size antecedent = " + MAX_SIZE_OF_ANTECEDENT);
// Loop through tenths data sets
for (int index=0;index<10;index++) {
String s = "[--- " + (index+1) + " ---] ";
if (textArea==null) System.out.println(s);
else textArea.append(s);
// Create training and test sets
createTrainingAndTestDataSets(index);
// Mine data, produce T-tree and generate CRs
startCARgeneration();
// Set parameters
parameters[index][0] = numFrequentSets;
parameters[index][1] = numUpdates;
parameters[index][2] = calculateStorage();
parameters[index][3] = getNumRules();
}
// Determine totals
ouputTCVparam(parameters);
double totalNumFreqSets = 0;
double totalNumUpdates = 0;
double totalStorage = 0;
double totalNumCARs = 0;
for (int index=0;index CARs differ from ARs in that they have only a single consequent and that
the number of admissable consequents is limited. Note that classifiers are
assumed to be listed at the end of the attribute list.
@param level the current level in the T-tree. */
protected void generateCARs(int level) {
// Loop through classifiers
for (int index=numOneItemSets-numClasses+1;
index<=numOneItemSets;index++) {
// Check number of CARS generated so far
if (numCarsSoFar>MAX_NUM_CARS) {
String s = "Number of CARs (" + numCarsSoFar + ") generted " +
"so far exceeds limit of " + MAX_NUM_CARS +
", generation process stopped!\n";
if (textArea==null) System.out.println(s);
else textArea.append(s);
return;
}
// Else process
if (startTtreeRef[index]!=null &&
startTtreeRef[index].childRef!=null) {
if (startTtreeRef[index].support >= minSupport) {
short[] consequent = new short[1];
consequent[0] = (short) index;
generateCARs(null,index,level-1,consequent,
startTtreeRef[index].childRef);
}
}
}
}
/* GENERATE CLASSIFICATION ASSOCIATION RULES */
/** Continues process of generating classification association rules from
a T-tree by recursively looping through T-tree level by level.
@param itemSetSofar the label for a T-treenode as generated sofar.
@param size the length/size of the current array lavel in the T-tree.
@param level the current level in the T-tree
@param consequent the current consequent (classifier) for the CAR.
@param linkRef the reference to the current array lavel in the T-tree. */
protected void generateCARs(short[] itemSetSofar, int size, int level,
short[] consequent, TtreeNode[] linkRef) {
// If no more nodes return
if (linkRef == null) return;
// At right level
if (level==1) generateCARsRightLevel(itemSetSofar,size,consequent,
linkRef);
// Wrong level, Otherwise process
else generateCARsWrongLevel(itemSetSofar,size,consequent,level,
linkRef);
}
/* GENERATE CLASSIFICATION ASSOCIATION RULES (RIGHT LEVEL). */
/** Generating classificationh association rules from a given array of
T-tree nodes.
@param itemSetSofar the label for a T-treenode as generated sofar.
@param size the length/size of the current array lavel in the T-tree.
@param consequent the current consequent (classifier) for the CAR.
@param linkRef the reference to the current array lavel in the T-tree. */
protected void generateCARsRightLevel(short[] itemSetSofar, int size,
short[] consequent, TtreeNode[] linkRef) {
// Loop through T-tree array
for (int index=1; index < size; index++) {
// Check if node exists
if (linkRef[index] != null) {
// Generate Antecedent
short[] tempItemSet = realloc2(itemSetSofar,(short) index);
// Determine confidence
double confidenceForCAR = getConfidence(tempItemSet,
linkRef[index].support);
// Add CAR to linked list structure if confidence greater
// than minimum confidence threshold.
if (confidenceForCAR >= confidence) {
numCarsSoFar++;
insertRuleIntoRulelist(tempItemSet,consequent,
confidenceForCAR,linkRef[index].support);
}
}
}
}
/* GENERATE CLASSIFICATION ASSOCIATION RULES (WRONG LEVEL). */
/** Generating classificationh association rules from a given array of
T-tree nodes.
@param itemSetSofar the label for a T-treenode as generated sofar.
@param size the length/size of the current array lavel in the T-tree.
@param consequent the current consequent (classifier) for the CAR.
@param level the current level in the T-tree.
@param linkRef the reference to the current array lavel in the T-tree. */
protected void generateCARsWrongLevel(short[] itemSetSofar, int size,
short[] consequent, int level, TtreeNode[] linkRef) {
// Loop through T-tree array
for (int index=1; index < size; index++) {
// Check if node exists
if (linkRef[index] != null && linkRef[index].childRef!=null) {
short[] tempItemSet = realloc2(itemSetSofar,(short) index);
// Proceed down child branch
generateCARs(tempItemSet,index,level-1,consequent,
linkRef[index].childRef);
}
}
}
/* ========================================================== */
/* */
/* HILL CLIMBING METHODS */
/* */
/* ========================================================== */
/* Apriori-TFP classification with 8 point hill climbing. The hill
climbing approach assumes a 2D space of the form:
Support
^
|
|
|
|
+---------> Confidence
Thus increasing support has the effect of moving "north" and decreasing
support "south". We increase/decrease support/confidence using the dsupp
and doconf values. The algorithm thus moves through the space adjusting
support and confiodence coordinates with a view to maximising accuracy.
We test EIGHT points round a central location and pick either (i)
the central location to "zero in" on, or (ii) one pf the eight points to
move to. Alternatives that have been tried (at least with respect to
Apriori-TFP) include 4 point hill climbing, however the results achieved
have not been as good (although 4 point hill climbing requirers less
time). */
/* HILL CLIMBING */
/** Commences hill climbing process to maximise accuracy by adjusting
support and confidence thresholds. This is done by generating accuracies
for eight points round the current lcation:
Proceed as follows:
1) Determine best accuracy
2) Options:
i) If
2) Determine number of best accuracies:
a) If Only one best accuracy:
i) If best accuracy at center "zero" in
ii) Otherwise move to best accuracy and repaeat
b) If number of best accuracy locations equivalent to number of
available locations (i.e. excluding loctaions outside playing area)
emd.
c) Otherwise select best location out of available best accuracies.
@param dsupp the current support adjustment value.
@param dconf the current confidence adjustmernt value.
(currentSupp) and confidence (currentConf) thresholds.*/
protected void idHCoption(double dsupp, double dconf) {
// Find location with maximum accuracy (if more than one location have
// the same maximum accuracy then the first such location found is
// recorded).
int maxIndex = getLocationWithBestAccuracy();
double maxAccuracy = locData[maxIndex].accuracy;
String s = "Best Accuracy = " + twoDecPlaces(maxAccuracy) + "\n";
if (textArea==null) System.out.println(s);
else textArea.append(s);
// Determine number of location with the best accuracy
int numBestAccuracies = getNumBestAccuracyLocations(maxAccuracy);
// If only one best accuracy then either "zero in" or "move to new
// location as appropriate.
if (numBestAccuracies==1) {
moveOrZeroIn(maxIndex,locData[maxIndex].suppCoord,
locData[maxIndex].confCoord,dsupp,dconf,
locData[maxIndex].accuracy,locData[maxIndex].numberOfCRs);
return;
}
// Determine number of available locations (i.e. excluding locations
// outside the paying area), maximum is 9
int availableLocations = getNumAvailableLocations();
// If number of locations with "best accuracy" is equal to the number
// of available locations -1 (i.e. nearly all locations have best
// accuracy) then end.
if (numBestAccuracies>=
((double) availableLocations*BEST_TO_AVAILABLE_RATIO)) {
endHillClimbing(locData[maxIndex].suppCoord,
locData[maxIndex].confCoord,maxAccuracy,
locData[maxIndex].numberOfCRs);
return;
}
// Select one of the available locations with "best accuracy" and
// proceed
identifyNewLocation(dsupp,dconf,maxAccuracy);
}
/* GET LOCATION WITH BEST ACCURACY */
/** Finds the location with the best accuracy. Note that if there
exists more than one location with equal best accurcy the first such
location found returned.
@return index of location with best accuracy. */
private int getLocationWithBestAccuracy() {
double maxAccuracy = locData[8].accuracy; // Start max accuarcy
int maxIndex = 8; // Default
// Loop through location data
for(int index=7;index>=0;index--) {
if (locData[index].accuracy>maxAccuracy) {
maxAccuracy=locData[index].accuracy;
maxIndex=index;
}
}
// Return
return(maxIndex);
}
/* GET NUMBER OF BEST ACCURACY LOCATIONS */
/** Returns the number of locations that feature the identified best
accuracy.
@param maxAccuracy the identified best accuracy.
@return the number of "best accuracy" lcations. */
private int getNumBestAccuracyLocations(double maxAccuracy) {
int counter=0;
// Loop through location data
for(int index=0;index Procceds as follows:
1) If center is one of the best accuracy locations then zero in on center.
2) Otherwise, For each best accuracy increment a total counter (N, E, S or
W) according to location,
3) Identify a location by considering totals.
a) If identified location is a "best" location move to this location.
b) If identified location is not a "best" location (i.e. dies not have
the best accuracy associated with it) but centre is a
best location zero in on the center location.
c) Otherwise select location as follows by mobing outward in both a
clockwise and anti-clockwise direction from identified loaction.
@param dsupp the current support adjustment value.
@param dconf the current confidence adjustmernt value.
@param maxAccuracy the best accuracy so far. */
private void identifyNewLocation(double dsupp,
double dconf, double maxAccuracy) {
// If center is best location zero in on this location
if (similar2dec(locData[8].accuracy,maxAccuracy)) {
zeroInOnSuppAndConf(locData[8].suppCoord,locData[8].confCoord,
dsupp,dconf,locData[8].accuracy,
locData[8].numberOfCRs);
return;
}
// Determine total support and confidence values (exclude center)
int totalN=0, totalE=0, totalS=0, totalW=0;
for(int index=0;index
commence by testing whether given best location is at center or not,
if so move "north", otherwise proceed.
@param index the index of the best accuracy loaction .
@param maxAccuracy the best accuracy so far.
@param return index of "next best" location. */
private int findNextBestLocation(int index, double maxAccuracy) {
// If at center move "north" first
if (index==8) index=0;
// Try moving left (anti-clockwise)
return(findNextBestLocation(index,index,maxAccuracy));
}
/** Finds the next best location where the identified location is not a
"best" location (i.e. does not hava the maximum accuracy associated with it.
proceeds in a breadth first fasion moving out in both anti-clockwise
and clockwise direction direction from the current index.
@param indexLeft the index from which to move left (anti-clockwise).
@param indexRight the index from which to move right (clockwise).
@param maxAccuracy the best accuracy so far.
@param return index of "next best" location. */
private int findNextBestLocation(int indexLeft, int indexRight,
double maxAccuracy) {
// Try moving left (anti-clockwise)
if (indexLeft==0) indexLeft=7;
else indexLeft=indexLeft-1;
if (similar2dec(locData[indexLeft].accuracy,maxAccuracy))
return(indexLeft);
// Try Moving right (clockwise)
if (indexRight==7) indexRight=0;
else indexRight=indexRight+1;
if (similar2dec(locData[indexRight].accuracy,maxAccuracy))
return(indexRight);
// Repeat and return
return(findNextBestLocation(indexLeft,indexRight,maxAccuracy));
}
/* ZERO IN ON SUPPORT AND CONFIDENCE */
/** "Zero's" in on current support and confidence thresholds. Location
array indexes are as follows:
used
during hill climbing where support value may be increment/decrement out of
the "paying" area.
@param newSupp the given support value.
@return true if within range, and false otherwise. */
protected boolean checkSupport(double newSupp) {
if (newSupp<=minimumSupAndConfValue ||
newSupp>=MAXIMUM_SUP_OR_CONF_VALUE) return(false);
else return(true);
}
/* CHECK CONFIDENCE */
/** Checks whether adjusted confidence value is still within range.
used during hill climbing where confidence value may be increment/decrement
out of the "palying" area.
@param newConf the given confidence value.
@return true if within range, and false otherwise. */
protected boolean checkConfidence(double newConf) {
if (newConf<=minimumSupAndConfValue ||
newConf>=MAXIMUM_SUP_OR_CONF_VALUE) return(false);
else return(true);
}
/* MOVE TO NEW LOCATION */
/** Implements change in support or confidence value. Moving along the
N-S axis has the effiect of increasing/decreasing support, and moving along
the E-W axis the effect of increasing/decreasing condidence.
@param flag the idicator for whether we are increassing or lowering support
and/or confidence.
@param supp the current support threshold.
@param conf the current confidence threshold.
@param dsupp the current support adjustment value.
@param dconf the current confidence adjustmernt value.
@param accuracy the accuracy given the current support and confidence.
@param numCRs the number ogf generated classification rules given the
current support and confidence. */
protected void moveToNewLocation(int flag, double supp, double conf,
double dsupp, double dconf, double accuracy, int numCRs) {
switch (flag) {
case 0: // Increase support (north)
moveLocDataForIncSupp();
break;
case 1: // Increase support and confidence (north-east)
moveLocDataForIncSuppIncConf();
break;
case 2: // Increase confidence (east)
moveLocDataForIncConf();
break;
case 3: // Decrease support and increase confidence (south-east)
moveLocDataForIncSuppDecConf();
break;
case 4: // Decrease support (south))
moveLocDataForDecSupp();
break;
case 5: // Decrease support and confidence (south-west)
moveLocDataForDecSuppDecConf();
break;
case 6: // Decrease confidence (west)
moveLocDataForDecConf();
break;
case 7: // Increase support and decrease confidence (north-west)
moveLocDataForDecSuppIncConf();
break;
default:
JOptionPane.showMessageDialog(null,"Unexpected hill climbing " +
"error","HILL CLIMBING ERROR",JOptionPane.ERROR_MESSAGE);
System.exit(1);
}
if (textArea==null) System.out.println("MOVE");
else textArea.append("MOVE\n");
// Proceed with move
hillClimbing(supp,conf,dsupp,dconf,accuracy,numCRs);
}
/* -------------------------------------------------------------- */
/* */
/* LOCATION DATA ARRAY MANIPULATION METHODS */
/* */
/* -------------------------------------------------------------- */
/* CREATE LOCATION ARRAY */
/** Creates a location array to allow hill climbing approach to keep track
of where "it has been". */
public void createLoactaionArray() {
// Dimensionarray
locData = new HClocData[9];
// Populate grid with default values
for (int index=0;index Used for
diagnostic purposes only. */
protected void outputHClocData(){
String s = "---------------------------------\n";
if (textArea==null) System.out.println(s);
else textArea.append(s);
String[] labels = {" N","NE"," E","SE"," S","SW"," W","NW"," C"};
for (int index=0;index
7--------0--------1
| | |
+dsupp | | |
| | |
6--------8--------2
| | |
-dsupp | | |
| | |
5--------4--------3
-dconf +dconf
provided that we do not: (i) calculate an accuracy we have already
calculated (information available in the location data) or (ii) move
out of the search space.
@param currentSupp the current support threshold.
@param currentConf the current confidence threshold.
@param dsupp the current support adjustment value.
@param dconf the current confidence adjustmernt value.
@param currentAccuracy the accuracy given the current support
(currentSupp) and confidence (currentConf) thresholds.
@param currentNumCRs the number of rules generated given the current
support (currentSupp) and confidence (currentConf)
thresholds. */
protected void hillClimbing(double currentSupp, double currentConf,
double dsupp, double dconf, double currentAccuracy,
int currentNumCRs) {
// Stubb
}
/* ---------------------------------------------------------- */
/* */
/* MOVE OR ZERO IN */
/* */
/* ---------------------------------------------------------- */
/* IDENTIFY HILL CLIMBING OPTION */
/** Determines whether to implement a change in the value of the support or
confidence or to "zero in" on the current support and confidence.
+---+---+---+ ^
| 7 | 0 | 1 | | Support
+---+---+---+ |
| 6 | 8 | 2 |
+---+---+---+
| 5 | 4 | 3 |
+---+---+---+ ---> Confidende
@param currentSupp the current support threshold.
@param currentConf the current confidence threshold.
@param dsupp the current support adjustment value.
@param dconf the current confidence adjustmernt value.
@param currentAccuracy the accuracy given the current support
(currentSupp) and confidence (currentConf) thresholds.
@param currentNumCRs the number of classification rules generated given the
current support (currentSupp) and confidence
(currentConf) thresholds. */
private void zeroInOnSuppAndConf(double currentSupp, double currentConf,
double dsupp, double dconf, double currentAccuracy,
int currentNumCRs) {
if (textArea==null) System.out.println("ZERO IN");
else textArea.append("ZERO IN\n");
// Define new location array
createLoactaionArray();
locData[8] = new HClocData(currentSupp,currentConf,currentAccuracy,
currentNumCRs,true,false); // Start location done
// If 'dsupp' already at minimum, reduce confidence only, therefore
// do not calculate accuracies for indexes 0,1,3,4,5 and 7
if (dsupp < minimumDiffSupport) {
for (int index=0;index<6;index++) locData[index].inArea=false;
locData[7].inArea=false;
double newDconf = dconf/2.0;
hillClimbing(currentSupp,currentConf,dsupp,newDconf,
currentAccuracy,currentNumCRs);
return;
}
// If 'dconf' already at minimum, reduce support only, therefore
// do not calculate accuracies for indexes 1,2,3,5,6 and 7
if (dconf < minimumDiffConfidence) {
for (int index=1;index<4;index++) locData[index].inArea=false;
for (int index=5;index<8;index++) locData[index].inArea=false;
double newDsupp = dsupp/2.0;
hillClimbing(currentSupp,currentConf,newDsupp,dconf,
currentAccuracy,currentNumCRs);
return;
}
// Otherwise reduce both support and confidance
double newDconf = dconf/2.0;
double newDsupp = dsupp/2.0;
hillClimbing(currentSupp,currentConf,newDsupp,newDconf,
currentAccuracy,currentNumCRs);
}
/* CHECK SUPPORT */
/** Checks whether adjusted support value is still within range.