/* -------------------------------------------------------------------------- */ /* */ /* P A R T I A L S U P P O R T T R E E */ /* */ /* Frans Coenen */ /* */ /* Wednesday 9 January 2003 */ /* (Revised 5/7/2003, 11/10/2006, 26/3/2007) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Structure: AssocRuleMining | +-- TotalSupportTree | +-- PartialSupportTree */ /* Java packages */ import java.io.*; import java.util.*; // Java GUI packages import javax.swing.*; /** Methods to implement the "Apriori-TFP" (Total From Partial) ARM algorithm using both the T-tree (Total support tree) and P-tree (Partial support tree data structures. @author Frans Coenen @version 11 October 2006 */ public class PartialSupportTree extends TotalSupportTree { /*------------------------------------------------------------------------*/ /* */ /* FIELDS */ /* */ /*------------------------------------------------------------------------*/ /* ------ NESTED CLASSES ------ */ /** Structurte to contain P-tree data in tabular form for improved computational efficiency when creating T-tree.
A 2-D array of these
structures is created in which to store the Ptree. */
protected class PtreeRecord {
/** Label for P-tree node. */
protected short[] pTreeNodeLabel = null;
/** Union of a pTree node label (pTreeNodeLabel) and all its
ancestor node labels. */
protected short[] pTreeItemSet = null;
/** Partial support count. */
protected int support = 0;
/** Constructor to creaste a P-tree record for inclusion in table.
@param nodeLabel the label for P-tree node.
@param itemSet the uunion of a pTree node label (nodeLabel)
and all its ancestor node labels.
@param sup the partial support count. */
protected PtreeRecord(short[] nodeLabel, short[] itemSet, int sup) {
pTreeNodeLabel = nodeLabel;
pTreeItemSet = itemSet;
support = sup;
}
}
/** Reference variable pointing to start of P-tree. */
protected PtreeNodeTop[] startPtreeRef = null;
/* ------ P-TREE DATA TABLE ----- */
/** Array of arrays data structures for P-tree table (used as a
computational efficiency measure). */
protected PtreeRecord[][] startPtreeTable = null;
/** Array for holding the number of P-tree nodes for each possible
cardinality given the number of attributes, maximum is equal to number of
columns. */
protected int[] pTreeNodesOfCardinalityN = null;
/** Array of "markers" used during the generation of the P-tree, and
contains "current index" values for each level of cardinality. */
protected int[] pTreeTableMarker = null;
/* Other fields */
/** Number of node updates (used for diagnostic purposes). */
private int numPtreeNodeUpdates = 0;
/** The maximum number of P-tree nodes that can be output to a graph. */
private int maxPtreeGraphNodes = 0;
/** Flag indicating that P-tree statistics output is desired. */
private boolean outputPtreeStatsFlag = false;
/** Flag indicating that P-tree (as text) output is desired. */
public boolean outputPtreeFlag = false;
/** Flag indicating that P-tree grpah output is desired. */
private boolean outputPtreeGraphFlag = false;
/** Flag indicating whether it is OK to delete the P-tree branch by
branch as the P-tree table is built (true by default). */
private boolean okToDeletePtreeFlag = true;
/*---------------------------------------------------------------------*/
/* */
/* CONSTRUCTORS */
/* */
/*---------------------------------------------------------------------*/
/** Constructor with command line arguments to be process.
@param args the command line arguments (array of String instances). */
public PartialSupportTree(String[] args) {
super(args);
}
/** With argument from existing instance of class AssocRuleMining.
@param armInstance the given instance of the AssocRuleMining
class. */
public PartialSupportTree(AssocRuleMining armInstance) {
super(armInstance);
}
/** Default constructor. */
public PartialSupportTree() {
}
/*-------------------------------------------------------------------*/
/* */
/* P-TREE BUILDING METHODS */
/* */
/*-------------------------------------------------------------------*/
/* CREATE P-TREE */
/** Processes data set causing each row to be added to P-Tree. */
public void createPtree() {
// Dimension top line of P-tree
startPtreeRef = new PtreeNodeTop[numOneItemSets+1];
// Dimension P-tree table
startPtreeTable = new PtreeRecord[numOneItemSets+1][];
pTreeNodesOfCardinalityN = new int[numOneItemSets+1];
pTreeTableMarker = new int[numOneItemSets+1];
// Initilalise top level of Ptree with nulls
for(int index=0;index Operates as follows:
If found leaf node create new node and stop. Otherwise:
Codes generated by call the checkitemSet function. Arguments as follows:
@param flag the type of branch currently under consideration:
code 0 = root, 1 = child, 2 = sibling.
@param parentLength the number of elements represented by the parent node
of the current node. Used only when adding new "dummy" nodes to maintain
count of number of nodes of a given size for when the Ptree table is
generated.
@param itemSetLength the number of elements in the current itemSet. Used
only when adding new nodes to maintain count of number of nodes of a given
size for when the Ptree table is generated.
@param linkRef the reference (pointer) to current location in the P-tree.
@param itemSet the row itemSet in the input file currently under
consideration.
@param topIndex the index of the element in the array marking the top level
of the P-tree, used only when inserting new nodes hanging from this top
level otherwise ignored.
@param oldRef the reference (pointer) to the previous location in the P-tree,
used when inserting new nodes. */
protected void addToPtree(int flag, int parentLength, int itemSetLength,
PtreeNode linkRef, short[] itemSet, int topIndex,
PtreeNode oldRef) {
// No child node hanging of previous level array therefore add new
// node here.
if (linkRef == null) {
PtreeNode newRef = createPtreeNode(itemSet,itemSetLength);
addSupport2(flag,newRef,topIndex,oldRef);
}
// Otherwise process tree
else {
switch (checkItemSets(itemSet,linkRef.itemSet)) {
case 1: /* Rule 1: Same */
numPtreeNodeUpdates++;
linkRef.support++;
break;
case 2: /* Rule 2: Before and subset (parent) */
beforeAndSubset(flag,itemSetLength,linkRef,itemSet,topIndex,
oldRef);
break;
case 3: /* Rule 3: Before and not subset (elder sibling) */
beforeAndNotSubset(flag,parentLength,itemSetLength,linkRef,
itemSet,topIndex,oldRef);
break;
case 4: /* Rule 4: After and superset (child) */
afterAndSuperset(parentLength,itemSetLength,linkRef,
itemSet);
break;
case 5: /* Rule 5: After and not superset (younger sibling) */
afterAndNotSuperset(flag,parentLength,itemSetLength,linkRef,
itemSet,topIndex,oldRef);
break;
default: /* Default: Error */
}
}
}
/* BEFORE AND SUBSET */
/** Adds new node into the P-tree on a parent/child link so that the new
node is the parent of the existing child branch and the child of the
previous "parent"; also checks if any siblings need to be "moved up".
Possibilities:
First checks for leading substring with existing node;
if found and leading substring is not same as current parent creates a new
P-tree node for this substring and then adds in the new node. Possibilities:
If no more child nodes add new node to "current" node as child,
else carry on down the tree with flag set to 1 to indicate we are following
a child branch. Possibilities:
Possible actions:
Also tests for leading
substring. If found and this is not equal to an existing parent itemSet
method causes a dummy node to represent the substring to be inserted (call
to addSupport2). Possibilities:
Possible actions:
The flag argument indicates which type of branch is currently under
consideration: 0 = root, 1 = child, 2 = sibling.
@param flag the type of branch currently under consideration:
code 0 = root, 1 = child, 2 = sibling.
@param newRef the reference (pointer) to newly created parent indicating
current laction in the P-tree.
@param topIndex the index of the element in the array marking the top level
of the P-tree, used only when inserting new nodes hanging from this top
level otherwise ignored.
@param oldRef the reference (pointer) to the previous location in the P-tree,
used when inserting new nodes. */
private void addSupport2(int flag, PtreeNode newRef, int topIndex,
PtreeNode oldRef) {
// Add node
switch (flag) {
case 0:
startPtreeRef[topIndex].childRef = newRef;
break;
case 1:
oldRef.childRef = newRef;
break;
case 2:
oldRef.siblingRef = newRef;
break;
default:
System.out.println("ERROR: Unidentified flag in addSupport\n");
}
}
/* ------ CHECK SIBLING BRANCH ------ */
/** Checks sibling branch to determine whether the siblings are all
supersets of the parent and readjusts P-tree accordingly. Possibilities:
Proceed as
follows.
Overides method in
TotalSupportTree class (operates using a P-tree tavle rather than the
raw dataset). */
protected void createTtreeTopLevel2() {
numLevelsInTtree = 1;
// Step through Ptree table
for(int index=1;index
@param flag the type of branch currently under consideration:
code 0 = root, 1 = child, 2 = sibling.
@param itemSetLength the number of elements in the current itemSet. Used
only when adding new nodes to maintain count of number of nodes of a given
size for when the Ptree table is generated.
@param linkRef the reference (pointer) to current location in the P-tree.
@param currentItemSet the row itemSet in the input file currently under
consideration.
@param topIndex the index of the element in the array marking the top level
of the P-tree, used only when inserting new nodes hanging from this top
level otherwise ignored.
@param oldRef the reference (pointer) to the previous location in the P-tree,
used when inserting new nodes.*/
private void beforeAndSubset(int flag, int itemSetLength, PtreeNode linkRef,
short[] currentitemSet, int topIndex, PtreeNode oldRef) {
// Create new node with support of current node added in;
PtreeNode newRef = createPtreeNode(currentitemSet,itemSetLength);
newRef.support = newRef.support+linkRef.support;
numPtreeNodeUpdates++;
// Link in existing branch
newRef.childRef = linkRef;
// Connect new node into tree structure and adjust existing current
// node itemSet so that it does not include the new parent node itemSet
addSupport2(flag,newRef,topIndex,oldRef);
linkRef.itemSet = realloc4(linkRef.itemSet,currentitemSet);
// Check whether any siblings of the existing node need to be
// "moved up" to become a sibling of the new node
checkSiblingBranch(linkRef,currentitemSet,newRef);
}
/* BEFORE AND NOT SUBSET */
/** Insets node into P-tree where new itemset is an elder sibling of the
"current" node.
@param flag the type of branch currently under consideration:
code 0 = root, 1 = child, 2 = sibling.
@param parentLength the number of elements represented by the parent node
of the current node. Used only when adding new "dummy" nodes to maintain
count of number of nodes of a given size for when the Ptree table is
generated.
@param itemSetLength the number of elements in the current itemSet. Used
only when adding new nodes to maintain count of number of nodes of a given
size for when the Ptree table is generated.
@param linkRef the reference (pointer) to current location in the P-tree.
@param currentItemSet the row itemSet in the input file currently under
consideration.
@param topIndex the index of the element in the array marking the top level
of the P-tree, used only when inserting new nodes hanging from this top
level otherwise ignored.
@param oldRef the reference (pointer) to the previous location in the P-tree,
used when inserting new nodes. */
private void beforeAndNotSubset(int flag, int parentLength,
int itemSetLength, PtreeNode linkRef, short[] currentItemSet,
int topIndex, PtreeNode oldRef) {
// Find leading common ellements in row and sibling itemSets if any
short[] subsetItemSet = checkForLeadingSubString(currentItemSet,
linkRef.itemSet);
// If leading common ellements exists create new node representing
// common elements and add current itemSet as child of this new node.
// Otherwise add new itemSet as elder sibling.
if (subsetItemSet != null) { // Leading substring exists
// Create new parent representing subset
PtreeNode newParentRef = createPtreeNode(subsetItemSet,
subsetItemSet.length+parentLength-1);
// Add support made up of existing support + 1 for current row
newParentRef.support = linkRef.support+1;
// Insert new parent node into tree
addSupport2(flag,newParentRef,topIndex,oldRef);
// Remove leading substring from row itemSet
currentItemSet = realloc4(currentItemSet,subsetItemSet);
// Attach as child and add on sibling,
newParentRef.childRef = createPtreeNode(currentItemSet,
itemSetLength);
newParentRef.childRef.siblingRef = linkRef;
// Check whether any siblings need to be "moved up"
checkSiblingBranch(newParentRef.childRef,subsetItemSet,newParentRef);
}
else {
// Create new node
PtreeNode newSiblingRef =
createPtreeNode(currentItemSet,itemSetLength);
// Attach existing node as younger sibling
newSiblingRef.siblingRef = linkRef;
// Insert into tree
addSupport2(flag,newSiblingRef,topIndex,oldRef);
}
}
/* AFTER AND SUPERSET */
/** Insets node into P-tree where new itemset is an child of the "current"
node.
@param parentLength the number of elements represented by the parent node
of the current node. Used only when adding new "dummy" nodes to maintain
count of number of nodes of a given size for when the Ptree table is
generated.
@param itemSetLength the number of elements in the current itemSet. Used
only when adding new nodes to maintain count of number of nodes of a given
size for when the Ptree table is generated.
@param linkRef the reference (pointer) to current location in the P-tree.
@param currentItemSet the row itemSet in the input file currently under
consideration. */
private void afterAndSuperset(int parentLength, int itemSetLength,
PtreeNode linkRef, short[] currentItemSet) {
numPtreeNodeUpdates++;
linkRef.support = linkRef.support+1; // Increment support
// End of child branch
if (linkRef.childRef == null) {
// Remove existing parent itemSet from currentItemSet
PtreeNode newRef = createPtreeNode(realloc4(currentItemSet,
linkRef.itemSet),itemSetLength);
// Add to existing node as child
linkRef.childRef = newRef;
}
// More children, remove existing current itemSet from row itemSet and
// continue down child branch
else addToPtree(1,parentLength+linkRef.itemSet.length,itemSetLength,
linkRef.childRef,realloc4(currentItemSet,
linkRef.itemSet),0,linkRef);
}
/* AFTER AND NOT SUPERSET */
/** Commeences process of inserting node into P-tree where new itemset is
a younger sibling of the "current" node.
@param flag the type of branch currently under consideration:
code 0 = root, 1 = child, 2 = sibling.
@param parentLength the number of elements represented by the parent node
of the current node. Used only when adding new "dummy" nodes to maintain
count of number of nodes of a given size for when the Ptree table is
generated.
@param itemSetLength the number of elements in the current itemSet. Used
only when adding new nodes to maintain count of number of nodes of a given
size for when the Ptree table is generated.
@param linkRef the reference (pointer) to current location in the P-tree.
@param currentItemSet the row itemSet in the input file currently under
consideration.
@param topIndex the index of the element in the array marking the top level
of the P-tree, used only when inserting new nodes hanging from this top
level otherwise ignored.
@param oldRef the reference (pointer) to the previous location in the P-tree,
used when inserting new nodes. */
private void afterAndNotSuperset(int flag, int parentLength,
int itemSetLength, PtreeNode linkRef, short[] currentItemSet,
int topIndex, PtreeNode oldRef) {
// Test if end of sibling branch, if not continue
if (linkRef.siblingRef == null)
afterAndNotSuperset1(flag,parentLength,itemSetLength,
linkRef,currentItemSet,topIndex,oldRef);
// Not end of sibling branch
else afterAndNotSuperset2(flag,parentLength,itemSetLength,linkRef,
currentItemSet,topIndex,oldRef);
}
/* AFTER AND NOT SUPERSET 1 */
/** Inserts node into P-tree where new itemset is a younger sibling of the
"current" node and there are no more younger siblings on current existing
node therefore new node added as sibling.
@param flag the type of branch currently under consideration:
code 0 = root, 1 = child, 2 = sibling.
@param parentLength the number of elements represented by the parent node
of the current node. Used only when adding new "dummy" nodes to maintain
count of number of nodes of a given size for when the Ptree table is
generated.
@param itemSetLength the number of elements in the current itemSet. Used
only when adding new nodes to maintain count of number of nodes of a given
size for when the Ptree table is generated.
@param linkRef the reference (pointer) to current location in the P-tree.
@param currentItemSet the row itemSet in the input file currently under
consideration.
@param topIndex the index of the element in the array marking the top level
of the P-tree, used only when inserting new nodes hanging from this top
level otherwise ignored.
@param oldRef the reference (pointer) to the previous location in the P-tree,
used when inserting new nodes. */
private void afterAndNotSuperset1(int flag, int parentLength,
int itemSetLength, PtreeNode linkRef, short[] currentItemSet,
int topIndex, PtreeNode oldRef) {
// Find leading common ellements in row and sibling itemSets if any
short[] subsetItemSet = checkForLeadingSubString(currentItemSet,
linkRef.itemSet);
// If leading common ellements exists create new node representing
// common elements and add current itemSet as child of this new node.
// Otherwise add new itemSet as elder sibling.
if (subsetItemSet != null) {
// Create new parent representing subset
PtreeNode newParent = createPtreeNode(subsetItemSet,
subsetItemSet.length+parentLength-1);
// Add support made up of existing support + 1 for current row
newParent.support = linkRef.support+1;
// Insert new parent node into tree
addSupport2(flag,newParent,topIndex,oldRef);
// Remove leading substring from current existing node
linkRef.itemSet = realloc4(linkRef.itemSet,subsetItemSet);
// Attach existing branch as child of new parent and new node
// as sibling
newParent.childRef = linkRef;
linkRef.siblingRef = createPtreeNode(realloc4(currentItemSet,
subsetItemSet),itemSetLength);
}
// No leading substring
else linkRef.siblingRef = createPtreeNode(currentItemSet,
itemSetLength);
}
/* AFTER AND NOT SUPERSET 2 */
/** Inserts node into P-tree where new itemset is a younger sibling of the
"current" node and there are more younger sibling on current existing
node.
Possibility:
@param flag the type of branch currently under consideration:
code 0 = root, 1 = child, 2 = sibling.
@param parentLength the number of elements represented by the parent node
of the current node. Used only when adding new "dummy" nodes to maintain
count of number of nodes of a given size for when the Ptree table is
generated.
@param itemSetLength the number of elements in the current itemSet. Used
only when adding new nodes to maintain count of number of nodes of a given
size for when the Ptree table is generated.
@param linkRef the reference (pointer) to current location in the P-tree.
@param currentItemSet the row itemSet in the input file currently under
consideration.
@param topIndex the index of the element in the array marking the top level
of the P-tree, used only when inserting new nodes hanging from this top
level otherwise ignored.
@param oldRef the reference (pointer) to the previous location in the P-tree,
used when inserting new nodes. */
private void afterAndNotSuperset2(int flag, int parentLength,
int itemSetLength, PtreeNode linkRef,
short[] currentItemSet, int topIndex, PtreeNode oldRef) {
// Find leading common ellements in row and sibling itemSets if any
short[] subsetItemSet = checkForLeadingSubString(currentItemSet,
linkRef.itemSet);
// If leading common ellements exists create new node representing
// common elements and add current itemSet as child of this new node.
// Otherwise add new itemSet as elder sibling.
if (subsetItemSet != null) {
// Create new parent representing leading common elements
PtreeNode newParentRef = createPtreeNode(subsetItemSet,
subsetItemSet.length+parentLength-1);
// Add support made up of existing support + 1 for current row
newParentRef.support = linkRef.support+1;
// Insert new parent node into tree
addSupport2(flag,newParentRef,topIndex,oldRef);
// Remove leading substring from current existing node and add
// as child of new parent node
linkRef.itemSet = realloc4(linkRef.itemSet,subsetItemSet);
newParentRef.childRef = linkRef;
// Store reference to existing branch of current existing node
// in temporary variable
PtreeNode tempRef = linkRef.siblingRef;
// Create new node representing row and add as sibling of
// current existing ref (NOTE: leading sub string will be
// removed when checking siblings (checkSiblingBranch)
linkRef.siblingRef = createPtreeNode(currentItemSet,itemSetLength);
// Now add in previous siblings
linkRef.siblingRef.siblingRef = tempRef;
// Check whether any siblings need to be "moved up"
checkSiblingBranch(newParentRef.childRef,subsetItemSet,newParentRef);
}
// Otherwise carry on along sibling branch
else addToPtree(2,parentLength,itemSetLength,linkRef.siblingRef,
currentItemSet,0,linkRef);
}
/* ------ ADD SUPPORT 2 ------ */
/** Adds new node where "before and subset" or "after and not superset".
Note: when a node is found that is not a superset of the parent we do not
need to keep on checking.
@param linkRef the reference (pointer) to the current node.
@param parentItemSet the itemset label represented by the parent.
@param newRef tghe reference (pointer) to the newly created parent node.*/
private void checkSiblingBranch(PtreeNode linkRef, short[] parentItemSet,
PtreeNode newRef) {
// Check if first node in sibling branch is a superset of parent
// itemSet. If not move the entire branch up.
if (linkRef.siblingRef != null) {
if (! isSubset(parentItemSet,linkRef.siblingRef.itemSet)) {
newRef.siblingRef = linkRef.siblingRef;
linkRef.siblingRef = null;
}
// Check rest. Branch starts of with supersets of parent itemSet
// (which are OK where they are), must find the point where this is
// no longer the case, i.e. the part of the branch that needs to be
// moved up (if any).
else {
// Remove leading substring
linkRef.siblingRef.itemSet =
realloc4(linkRef.siblingRef.itemSet,
parentItemSet);
// Set reference varibales
PtreeNode markerRef = linkRef.siblingRef;
PtreeNode localLinkRef = linkRef.siblingRef.siblingRef;
while (localLinkRef != null) {
if (! isSubset(parentItemSet,localLinkRef.itemSet)) {
newRef.siblingRef = localLinkRef;
markerRef.siblingRef = null;
break;
}
else {
localLinkRef.siblingRef.itemSet =
realloc4(localLinkRef.siblingRef.itemSet,
parentItemSet);
markerRef = localLinkRef;
localLinkRef = localLinkRef.siblingRef;
}
}
}
}
}
/* CREAT P-TREE NODE */
/** Creates a P-tree node (other than a top level node).
@param itemSet the itemset to be stored at the node.
@param level the cardinality (length) of the item set to be stored in
the node. */
protected PtreeNode createPtreeNode(short[] itemSet, int level) {
pTreeNodesOfCardinalityN[level]++;
return(new PtreeNode(itemSet));
}
/* GET START OF P-TREE */
/** Gets reference to start of P-tree. */
public PtreeNodeTop[] getStartOfPtree() {
return(startPtreeRef);
}
/* GET NUMBER P-TREE NODES */
/** Gets number of nodes in P-tree. */
public int getNumPtreeNodes() {
return(calculateNumNodes(startPtreeRef));
}
/*----------------------------------------------------------------------- */
/* */
/* P-TREE TABLE */
/* */
/*----------------------------------------------------------------------- */
/* CREATE P-TREE TABLE */
/** Creates P-tree table starting with top level in P-tree.
*/
public void createPtreeTable() {
// Set up array of arrays
for (int index=1;index