/* -------------------------------------------------------------------------- */ /* */ /* PRM (PREDICTIVE RULE MINING) CAR GENERATOR */ /* */ /* Frans Coenen */ /* */ /* Tuesday 3 February 2004 */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Class structure AssocRuleMining | +-- Classification | +-- PRM_CARgen */ // Java packages import java.util.*; import java.io.*; /** Methods to produce classification rules using PRM (Predictive Rule Mining) algorithm first proposed by Xiaoxin Yin and Jiawei Han. Assumes that input dataset is organised such that classifiers are at the end of each record. Note: number of classifiers value is stored in the numClasses field. @author Frans Coenen @version 3 February 2004 */ /* To compile: javaARMpackc.exe PRM_CARgen.java */ public class PRM_CARgen extends Classification { /* ------ FIELDS ------ */ // Nested classes /** Structure to store an example (positive or negative) records with weighting. */ protected class ExamplesStruct { /** Item set (example record). */ short[] itemSet; /** Weighting for the itemset. */ double weighting=1.0; /** One argument constructor @param record the itemset representing a record. */ protected ExamplesStruct(short[] record) { itemSet = copyItemSet(record); } /** Two argument constructor @param record the itemset representing a record. @param weight the weighting for the example record. */ protected ExamplesStruct(short[] record, double weight) { itemSet = copyItemSet(record); weighting = weight; } } // Data structures /** 2-D array to hold positive examples */ protected ExamplesStruct[] positiveExamples = null; /** 2-D array to hold negative examples */ protected ExamplesStruct[] negativeExamples = null; /** 2-D array to temporaily hold positive examples, what Xiaoxin Yin and Jiawei Han refer to as P'. */ protected ExamplesStruct[] positiveExamples2 = null; /** 2-D array to temporaily hold negative examples, what Xiaoxin Yin and Jiawei Han refer to as N'. */ protected ExamplesStruct[] negativeExamples2 = null; /** 2-D PNarray for storing size of current negative and positive example sets, what Xiaoxin Yin and Jiawei Han refer to as A. */ protected double[][] pn_array = null; /** 2-D temporary PN array for storing size of current negative and positive example sets, what Xiaoxin Yin and Jiawei Han refer to as A'. */ protected double[][] pn_array2 = null; // Constants /** Minimum total weight threshold */ protected final double TOTAL_WEIGHT_FACTOR=0.05; /** Weighting decrement */ protected final double DECAY_FACROR=1.0/3.0; /* ------ CONSTRUCTORS ------ */ /** Constructor processes command line arguments. @param args the command line arguments (array of String instances). */ public PRM_CARgen(String[] args) { super(args); } /* ------ METHODS ------ */ /* START CLASSIFICATION */ /** Starts classification rule generation process. @return The classification accuracy (%). */ public double startClassification() { System.out.println("START PRM CLASSIFICATION\n" + "------------------------"); // Set rule list to null. Note that startRuleList is defined in the // AssocRuleMining parent class and is also used to store Association // Rules (ARS) with respect to ARM. currentRlist.startRulelist = null; // Set DataArray and number of classes fields currentRlist.setDataArray(dataArray); currentRlist.setNumClasses(numClasses); // Check for classifier array if (classifiers==null) { System.out.println("ERROR: no classifiers array! To create a " + "classifiers array use createClassifiersArray(), " + "contained in ClassAprioriTserial class, called " + "from the application class."); System.exit(1); } // Start PRM generation process startPRM(); // Process rules processRules(); // Test classification using the test set and return accuracy. return(twoDecPlaces(testClassification())); } /* START PRM CLASSIFICATION */ /** Commences PRM process to generate CARs.

Proceeds as follows for each class: 1) Generate attribute array (for computational convenience only). 2) Generate P and N example data sets. 2) Calculate weighting threshold. 3) While current weighting for P is greater than threshold: a) Copy P, N and A to: get P', N' and A'. b) If first iteration copy PN to PN', else combine PN' positive weightings from previous iteration and original negative weightings to form new PN array. c) If not possible to generate further antecedent attributes break. d) Generate rule. */ private void startPRM() { // Generate attribute array attributes = new double[numOneItemSets-numClasses+1][2]; for(int attIndex=0;attIndextotalWeightThreshold) { // Copy P, N, PN array and attributes array positiveExamples2 = copyExamplesArray(positiveExamples); negativeExamples2 = copyExamplesArray(negativeExamples); pn_array2 = copyDouble2Darray(pn_array); attributes2 = copyDouble2Darray(attributes); // It is possible, if no attributes with gain above the // specified minimum are discovered, that the example positive // data set P will never be reduced to null --- therefore // tesy for this situation. if (noValidGainsinPNarray()) break; // Proceed with generation process prmGeneration(null,consequent); } } } /* PRM GENERATION */ /** Uses the PRM algorithm to generate a CAR for the given class by examining gain for each available attribute. @param antecedent the rule antecedent sofar. @param consequent the given class. */ private void prmGeneration(short[] antecedent, short[] consequent) { // Determine total weighting of previous example data sets. double oldPosWeigth = (double) getWeightOfExamples(positiveExamples2); double oldNegWeigth = (double) getWeightOfExamples(negativeExamples2); // Loop through attributes array and determine gain if added to rule, // gains placed in second row of attribute array. calculateGains(oldPosWeigth,oldNegWeigth); // Get index for attribute with best gain int indexOfBestAttribute = getBestGain(); // Inheritted method double bestGain = attributes2[indexOfBestAttribute][1]; // If best gain less than minimum, stop. if (bestGain <= MIN_BEST_GAIN) { foundRule(antecedent,consequent); return; } // Confirm by assigning to antecedent and indicating, in attributes // array, that selected attribute is no longer available antecedent = reallocInsert(antecedent,(short) indexOfBestAttribute); attributes2[indexOfBestAttribute][0]=1.0; // Revise example datasets by removing examples that do not satisfy // the current rule antecedent positiveExamples2=removeExDoNotSatRule(0,antecedent,positiveExamples2); negativeExamples2=removeExDoNotSatRule(1,antecedent,negativeExamples2); // If the negative example rule set is empty the total weighting for // the set will be 0.0 and thus any gain calculated for any attribute // will also be 0.0, therefore in the event of an empty negative example // set we may as well stop here rather than recur and calculate a set // of gains which will not be above the threshold! if (negativeExamples2==null) { foundRule(antecedent,consequent); return; } // Repeat prmGeneration(antecedent,consequent); } /* FOUND CLASSIFICATION RULE */ /** Inserts discovered rule into list of rules and revise positive examples array (not the copy of this array) and the PN array (again not the copy of this array).

Note the generic rule array also has field for support and confidence that are not used here. @param antecedent the antecedent of the discovered rule. @param consequent the consequent of the discovered rule. */ protected void foundRule(short[] antecedent, short[] consequent) { // It is possible, if we have no negative examples (N={}), to // produce a rule with an empty antecedent --- such a rule should // not be included in the rule list. if (antecedent==null) return; // Insert discovered rule into rule list double laplace = currentRlist.getLaplaceAccuracy(antecedent, consequent[0]); currentRlist.insertRuleintoRulelist(antecedent,consequent,laplace); // Revise positive examples revisePosExDoSatRule(antecedent,positiveExamples); } /* ------------------------------------------------------------ */ /* */ /* GAIN CALCULATIONS */ /* */ /* ------------------------------------------------------------ */ /* CALCULATE GAINS */ /** Loops through available attributes and determine gain for each if attribute added to rule.

Attribute is unavailable (i.e. has already been used in rule) if first element for attribute set to '1.0' @param oldPosWeigth total weighting represented by previous positive examples. @param oldNegWeigth total weighting represented by previous negative examples. */ protected void calculateGains(double oldPosWeigth, double oldNegWeigth) { // Loop through attribute list for (int index=1;index P = all records which DO include the current required class, (N) all records which DO NOT include the current class. D (Data Set) = P union N. @param classification the given class of interest. */ protected void generatePosAndNegExamples(short classification) { int posIndex=0, negIndex=0; // Loop through data array (training set) and determine total number // of positive and negative records for(int index=0;index Called whenever a new attribute is added to the current rules antecedent. Creates a new example structure array to hold records that DO staisfy the new rule rather than edditing the existing example structure array. Proceeds as follows:

  1. Determine size of new example structure array.
  2. Dimension new example structure array.
  3. Populate new example structure array with records that DO sataisfy the rule, and adjust PM array accordingly.
@param flag 0=positive examples, 1=negative examples. @param antecedent the given antecedent for the rule. @param examples the given examples data set. @return revise example data set with records that do not satisfy rule removed. */ protected ExamplesStruct[] removeExDoNotSatRule(int flag, short[] antecedent, ExamplesStruct[] examples) { // Return null if input array is empty. if (examples==null) return(null); // Dimension new array int size = getNumExDoSatRule(antecedent,examples); // If new array is to be empty all weightings in the copy of the PN // array associated with this example structure array must be 0.0 if (size==0) { for (int index=1;index Called whenever a new classification rule has been discovered and added to the rule list. @param antecedent the given antecedent for the rule. @param examples the given positives example structure array. */ protected void revisePosExDoSatRule(short[] antecedent, ExamplesStruct[] posExamples) { // Loop through given array for (int index1=0;index1 It is possible, if no attributes with gain above the specified minimum are discovered, that the current totalWeight for P will never be reduced to below the threshold. @return true if no attributes, false otherwise. */ protected boolean noValidGainsinPNarray() { boolean noValidGains = true; double oldPosWeigth = getWeightOfExamples(positiveExamples2); double oldNegWeigth = getWeightOfExamples(negativeExamples2); // Loop through PN array for (int index=1;indexMIN_BEST_GAIN) noValidGains=false; } // Return return(noValidGains); } /* COMBINE PN ARRAY WEIGHTINGS */ /** Produces a new PN array made up of the positive weightings derived as a consequence of the previous generation of a rule for the current class, and the negative weightings derived from the original set of negative examples contained in the old PN array. @param pnArrayRevised the revised array containing the required positive weightings @param pnArrayOld the original PN array containing weightings for the original set of negative examples. @return the ne combined PN array. */ /*protected double[][] combinePNarrayWeightings(double[][] pnArrayRevised, double[][] pnArrayOld) { // Diemsion PNarray double[][] newPNarray = new double[pnArrayOld.length][2]; // Loop through given PN array for(int index=0;index