/* -------------------------------------------------------------------------- */ /* */ /* P A R T I A L S U P P O R T T R E E */ /* */ /* Frans Coenen */ /* */ /* Wednesday 9 January 2003 */ /* (Revised 5/7/2003) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Structure: AssocRuleMining | +-- TotalSupportTree | +-- PartialSupportTree */ /* Java packages */ import java.io.*; import java.util.*; // Java GUI packages import javax.swing.*; /** Methods to implement the "Apriori-TFP" (Total From Partial) ARM algorithm using both the T-tree (Total support tree) and P-tree (Partial support tree data structures. @author Frans Coenen @version 5 July 2003 */ public class PartialSupportTree extends TotalSupportTree { /*------------------------------------------------------------------------*/ /* */ /* FIELDS */ /* */ /*------------------------------------------------------------------------*/ /* ------ NESTED CLASSES ------ */ /** Structurte to contain P-tree data in tabular form for improved computational efficiency when creating T-tree.

A 2-D array of these structures is created in which to store the Ptree. */ protected class PtreeRecord { /** Label for P-tree node. */ private short[] pTreeNodeLabel = null; /** Uunion of a pTree node label (pTreeNodeLabel) and all its ancestor node labels. */ private short[] pTreeItemSet = null; /** Partial support count. */ private int support = 0; /** Creaste P-tree record for inclusion in table. @param nodeLabel the label for P-tree node. @param itemSet the uunion of a pTree node label (nodeLabel) and all its ancestor node labels. @param sup the partial support count. */ private PtreeRecord(short[] nodeLabel, short[] itemSet, int sup) { pTreeNodeLabel = nodeLabel; pTreeItemSet = itemSet; support = sup; } } /* Array data structures for P tree */ /** Null array describing an "emty" itemset. */ private short[] zeroitemSet = null; /** Reference variable pointing to start of P-tree. */ private PtreeNodeTop[] startPtreeRef = null; /* ------ P-TREE DATA TABLE ----- */ /** Array of arrays data structures for P-tree table (used as a computational efficiency measure). */ protected PtreeRecord[][] startPtreeTable = null; /** Array for holding the number of P-tree nodes for each possible cardinality given the number of attributes, maximum is equal to number of columns. */ private int[] pTreeNodesOfCardinalityN = null; /** Array of "markers" used during the generation of the P-tree, and contains "current index" values for each level of cardinality. */ private int[] pTreeTableMarker = null; /* Other fields */ /** Number of node updates (used for diagnostic purposes). */ private int numberOfNodeUpdates = 0; /*---------------------------------------------------------------------*/ /* */ /* CONSTRUCTORS */ /* */ /*---------------------------------------------------------------------*/ /** Processes command line arguments.

The numOneItemSets is incremented by 1 so that indexes match the column numbers */ public PartialSupportTree(String[] args) { super(args); } /*-------------------------------------------------------------------*/ /* */ /* TREE BUILDING METHODS */ /* */ /*-------------------------------------------------------------------*/ /* CREATE P-TREE */ /** Processes data set causing each row to be added to P-Tree. */ public void createPtree() { System.out.println("GENERATING P-TREE\n------------------"); // Dimension top line of P-tree startPtreeRef = new PtreeNodeTop[numOneItemSets+1]; // Dimension P-tree table startPtreeTable = new PtreeRecord[numOneItemSets+1][]; pTreeNodesOfCardinalityN = new int[numOneItemSets+1]; pTreeTableMarker = new int[numOneItemSets+1]; // Initilalise top level of Ptree with nulls for(int index=0;index Note that the top level is an array. @param itemset the given item set. */ private void addToPtreeTopLevel(short[] itemSet) { int index = itemSet[0]; // Calculate index int itemSetLength = itemSet.length; // If single attibute itemSet create or update element, otherwise // create or update element and proceed down child branch (flag = 1) if (itemSetLength == 1) { // Top level if (startPtreeRef[index] == null) { startPtreeRef[index] = new PtreeNodeTop(); pTreeNodesOfCardinalityN[1]++; } else startPtreeRef[index].support = startPtreeRef[index].support+1; numberOfNodeUpdates++; } // itemSet length greater than 1 therefore proceed down rest of tree else { // If no top level node create one. if (startPtreeRef[index] == null) { startPtreeRef[index] = new PtreeNodeTop(); pTreeNodesOfCardinalityN[1]++; } else startPtreeRef[index].support = startPtreeRef[index].support+1; numberOfNodeUpdates++; // Descend from top level node addToPtree(0,2,itemSetLength,startPtreeRef[index].childRef, realloc3(itemSet),index,null); } } /* ADD TO P-TREE */ /** Inserts given itemset into P-tree.

Operates as follows: If found leaf node create new node and stop. Otherwise:

Codes generated by call the checkitemSet function. Arguments as follows: @param flag the type of branch currently under consideration: code 0 = root, 1 = child, 2 = sibling. @param parentLength the number of elements represented by the parent node of the current node. Used only when adding new "dummy" nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param itemSetLength the number of elements in the current itemSet. Used only when adding new nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param linkRef the reference (pointer) to current location in the P-tree. @param itemSet the row itemSet in the input file currently under consideration. @param topIndex the index of the element in the array marking the top level of the P-tree, used only when inserting new nodes hanging from this top level otherwise ignored. @param oldRef the reference (pointer) to the previous location in the P-tree, used when inserting new nodes. */ public void addToPtree(int flag, int parentLength, int itemSetLength, PtreeNode linkRef, short[] itemSet, int topIndex, PtreeNode oldRef) { // No child node hanging of previous level array therefore add new // node here. if (linkRef == null) { PtreeNode newRef = createPtreeNode(itemSet,itemSetLength); addSupport2(flag,newRef,topIndex,oldRef); } // Otherwise process tree else { switch (checkItemSets(itemSet,linkRef.itemSet)) { case 1: /* Rule 1: Same */ numberOfNodeUpdates++; linkRef.support++; break; case 2: /* Rule 2: Before and subset (parent) */ beforeAndSubset(flag,itemSetLength,linkRef,itemSet,topIndex, oldRef); break; case 3: /* Rule 3: Before and not subset (elder sibling) */ beforeAndNotSubset(flag,parentLength,itemSetLength,linkRef, itemSet,topIndex,oldRef); break; case 4: /* Rule 4: After and superset (child) */ afterAndSuperset(parentLength,itemSetLength,linkRef, itemSet); break; case 5: /* Rule 5: After and not superset (younger sibling) */ afterAndNotSuperset(flag,parentLength,itemSetLength,linkRef, itemSet,topIndex,oldRef); break; default: /* Default: Error */ } } } /* BEFORE AND SUBSET */ /** Adds new node into the P-tree on a parent/child link so that the new node is the parent of the existing child branch and the child of the previous "parent".

Possibilities:

  1. Connect to top level node with no siblings moved up ({1 2 3} {1 2})
  2. Connect to top level node with siblings moved up ({1 2 3} {1 4} {1 2})
  3. Connect to child ref with no siblings moved up ({1 2 3 4} {1 2} {1 2 3})
  4. Connect to child ref with siblings moved up ({1 2 3 4} {1 2 4 5} {1 2} {1 2 3})
  5. connect to sibling ref with no siblings moved up ({1 2} {1 3 4} {1 3})
  6. connect to sibling ref with siblings moved up ({1 2} {1 3 4} (1 4) {1 3})
@param flag the type of branch currently under consideration: code 0 = root, 1 = child, 2 = sibling. @param itemSetLength the number of elements in the current itemSet. Used only when adding new nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param linkRef the reference (pointer) to current location in the P-tree. @param currentItemSet the row itemSet in the input file currently under consideration. @param topIndex the index of the element in the array marking the top level of the P-tree, used only when inserting new nodes hanging from this top level otherwise ignored. @param oldRef the reference (pointer) to the previous location in the P-tree, used when inserting new nodes.*/ private void beforeAndSubset(int flag, int itemSetLength, PtreeNode linkRef, short[] currentitemSet, int topIndex, PtreeNode oldRef) { // Create new node with support of current node added in; PtreeNode newRef = createPtreeNode(currentitemSet,itemSetLength); newRef.support = newRef.support+linkRef.support; numberOfNodeUpdates++; // Link in existing branch newRef.childRef = linkRef; // Connect new node into tree structure and adjust existing current // node itemSet so that it does not include the new parent node itemSet addSupport2(flag,newRef,topIndex,oldRef); linkRef.itemSet = realloc4(linkRef.itemSet,currentitemSet); // Check whether any siblings of the existing node need to be // "moved up" to become a sibling of the new node newRef.siblingRef=linkRef.siblingRef; linkRef.siblingRef=null; } /* BEFORE AND NOT SUBSET */ /** Insets node into P-tree where new itemset is an elder sibling of the "current" node.

First checks for leading substring with existing node; if found and leading substring is not same as current parent creates a new P-tree node for this substring and then adds in the new node. Possibilities:

  1. Connect to top level node with no common leading substring with current node ({1 3} {1 2}).
  2. Connect to top level node with common leading substring with current node and no siblings moved up ({1 2 4} {1 2 3}).
  3. Connect to top level node with common leading substring with current node and siblings moved up ({1 2 4} (1 3} {1 2 3}).
  4. Connect to child ref with no common leading substring with current node ({1 2} {1 2 4} {1 2 3}).
  5. Connect to child ref with common leading substring with current node and no siblings moved up ({1 2} {1 2 4 5 7} {1 2 4 5 6}).
  6. Connect to child ref with common leading substring with current node and siblings moved up ({1 2} {1 2 4 5 7} {1 3} {1 2 4 5 6}).
  7. Connect to sibling ref with no common leading substring with current node ({1 2} {1 4} {1 3}).
  8. Connect to sibling ref with common leading substring with current node and no siblings moved up ({1 2} {1 4 5 7} {1 4 5 6}).
  9. Connect to sibling ref with common leading substring with current node and siblings moved up ({1 2} {1 4 5 7} {1 6} {1 4 5 6}).
@param flag the type of branch currently under consideration: code 0 = root, 1 = child, 2 = sibling. @param parentLength the number of elements represented by the parent node of the current node. Used only when adding new "dummy" nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param itemSetLength the number of elements in the current itemSet. Used only when adding new nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param linkRef the reference (pointer) to current location in the P-tree. @param currentItemSet the row itemSet in the input file currently under consideration. @param topIndex the index of the element in the array marking the top level of the P-tree, used only when inserting new nodes hanging from this top level otherwise ignored. @param oldRef the reference (pointer) to the previous location in the P-tree, used when inserting new nodes. */ private void beforeAndNotSubset(int flag, int parentLength, int itemSetLength, PtreeNode linkRef, short[] currentItemSet, int topIndex, PtreeNode oldRef) { // Find leading common ellements in row and sibling itemSets if any short[] leadingSubString = checkForLeadingSubString(currentItemSet, linkRef.itemSet); // If leading common ellements exists create new node representing // common elements and add current itemSet as child of this new node. // Otherwise add new itemSet as elder sibling. if (leadingSubString != null) { // Leading substring exists // Create new dummy parent representing subset PtreeNode newDummyParentRef = createPtreeNode(leadingSubString, leadingSubString.length+parentLength-1); // Add support made up of existing support + 1 for current row newDummyParentRef.support = linkRef.support+1; // Insert new parent node into tree addSupport2(flag,newDummyParentRef,topIndex,oldRef); // Remove leading substring from row itemSet and attach as child // of dummy parent node currentItemSet = realloc4(currentItemSet,leadingSubString); newDummyParentRef.childRef = createPtreeNode(currentItemSet, itemSetLength); // Remove leading substring from current node itemSet and place // current node as sibling of new node linkRef.itemSet=realloc4(linkRef.itemSet,leadingSubString); newDummyParentRef.childRef.siblingRef = linkRef; // Check whether any siblings need to be "moved up" newDummyParentRef.siblingRef=linkRef.siblingRef; linkRef.siblingRef=null; } else { // Create new node PtreeNode newSiblingRef = createPtreeNode(currentItemSet,itemSetLength); // Attach existing node as younger sibling newSiblingRef.siblingRef = linkRef; // Insert into tree addSupport2(flag,newSiblingRef,topIndex,oldRef); } } /* AFTER AND SUPERSET */ /** Insets node into P-tree where new itemset is an child of the "current" node.

If no more child nodes add new node to "current" node as child, else carry on down the tree with flag set to 1 to indicate we are following a child branch. Possibilities:

  1. Add to top level node ({1 2}).
  2. Add to child ref ({1 2} {1 2 3}).
@param parentLength the number of elements represented by the parent node of the current node. Used only when adding new "dummy" nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param itemSetLength the number of elements in the current itemSet. Used only when adding new nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param linkRef the reference (pointer) to current location in the P-tree. @param currentItemSet the row itemSet in the input file currently under consideration. */ private void afterAndSuperset(int parentLength, int itemSetLength, PtreeNode linkRef, short[] currentItemSet) { numberOfNodeUpdates++; linkRef.support = linkRef.support+1; // Increment support // End of child branch if (linkRef.childRef == null) { // Remove existing parent itemSet from currentItemSet PtreeNode newRef = createPtreeNode(realloc4(currentItemSet, linkRef.itemSet),itemSetLength); // Add to existing node as child linkRef.childRef = newRef; } // More children, remove existing current itemSet from row itemSet and // continue down child branch else addToPtree(1,parentLength+linkRef.itemSet.length,itemSetLength, linkRef.childRef,realloc4(currentItemSet, linkRef.itemSet),0,linkRef); } /* AFTER AND NOT SUPERSET */ /** Commeences process of inserting node into P-tree where new itemset is a younger sibling of the "current" node.

Possible actions:

  1. There are NO more sibling nodes (call afterAndNotSuperset1).
  2. There are more sibling nodes (call afterAndNotSuperset2).
@param flag the type of branch currently under consideration: code 0 = root, 1 = child, 2 = sibling. @param parentLength the number of elements represented by the parent node of the current node. Used only when adding new "dummy" nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param itemSetLength the number of elements in the current itemSet. Used only when adding new nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param linkRef the reference (pointer) to current location in the P-tree. @param currentItemSet the row itemSet in the input file currently under consideration. @param topIndex the index of the element in the array marking the top level of the P-tree, used only when inserting new nodes hanging from this top level otherwise ignored. @param oldRef the reference (pointer) to the previous location in the P-tree, used when inserting new nodes. */ private void afterAndNotSuperset(int flag, int parentLength, int itemSetLength, PtreeNode linkRef, short[] currentItemSet, int topIndex, PtreeNode oldRef) { // Test if end of sibling branch, if not continue if (linkRef.siblingRef == null) afterAndNotSuperset1(flag,parentLength,itemSetLength, linkRef,currentItemSet,topIndex,oldRef); // Not end of sibling branch else afterAndNotSuperset2(flag,parentLength,itemSetLength,linkRef, currentItemSet,topIndex,oldRef); } /* AFTER AND NOT SUPERSET 1 */ /** Inserts node into P-tree where new itemset is a younger sibling of the "current" node and there are no more younger siblings on current existing node therefore new node added as sibling.

Also tests for leading substring. If found and this is not equal to an existing parent itemSet method causes a dummy node to represent the substring to be inserted (call to addSupport2). Possibilities:

  1. Add to siblingRef with no common leading substring ({1 2} {1 3}).
  2. Add to siblingRef with common leading substring ({1 2} {1 3 5} {1 3 4}).
@param flag the type of branch currently under consideration: code 0 = root, 1 = child, 2 = sibling. @param parentLength the number of elements represented by the parent node of the current node. Used only when adding new "dummy" nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param itemSetLength the number of elements in the current itemSet. Used only when adding new nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param linkRef the reference (pointer) to current location in the P-tree. @param currentItemSet the row itemSet in the input file currently under consideration. @param topIndex the index of the element in the array marking the top level of the P-tree, used only when inserting new nodes hanging from this top level otherwise ignored. @param oldRef the reference (pointer) to the previous location in the P-tree, used when inserting new nodes. */ private void afterAndNotSuperset1(int flag, int parentLength, int itemSetLength, PtreeNode linkRef, short[] currentItemSet, int topIndex, PtreeNode oldRef) { // Find leading common ellements in row and sibling itemSets if any short[] leadingSubString = checkForLeadingSubString(currentItemSet, linkRef.itemSet); // If leading common ellements exists create new node representing // common elements and add current itemSet as child of this new node. // Otherwise add new itemSet as elder sibling. if (leadingSubString != null) { // Create new parent representing subset PtreeNode newParent = createPtreeNode(leadingSubString, leadingSubString.length+parentLength-1); // Add support made up of existing support + 1 for current row newParent.support = linkRef.support+1; // Insert new parent node into tree addSupport2(flag,newParent,topIndex,oldRef); // Remove leading substring from current existing node linkRef.itemSet = realloc4(linkRef.itemSet,leadingSubString); // Attach existing branch as child of new parent and new node // as sibling newParent.childRef = linkRef; linkRef.siblingRef = createPtreeNode(realloc4(currentItemSet, leadingSubString),itemSetLength); } // No leading substring else linkRef.siblingRef = createPtreeNode(currentItemSet, itemSetLength); } /* AFTER AND NOT SUPERSET 2 */ /** Inserts node into P-tree where new itemset is a younger sibling of the "current" node and there are more younger sibling on current existing node.

Possible actions:

  1. If there are more sibling nodes and the current row itemSet shares a leading substring with the current P-tree node and this is not equal to an existing parent itemSet then add a dummy node to represent the substring (call addSupport2). Then add new node.
  2. If more sibling nodes and no shared leading substring continue down sibling branch.
Possibility:
  1. Add in dummy node ({1 2 3} {1 3 5} {1 6} {1 3 6}).
@param flag the type of branch currently under consideration: code 0 = root, 1 = child, 2 = sibling. @param parentLength the number of elements represented by the parent node of the current node. Used only when adding new "dummy" nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param itemSetLength the number of elements in the current itemSet. Used only when adding new nodes to maintain count of number of nodes of a given size for when the Ptree table is generated. @param linkRef the reference (pointer) to current location in the P-tree. @param currentItemSet the row itemSet in the input file currently under consideration. @param topIndex the index of the element in the array marking the top level of the P-tree, used only when inserting new nodes hanging from this top level otherwise ignored. @param oldRef the reference (pointer) to the previous location in the P-tree, used when inserting new nodes. */ private void afterAndNotSuperset2(int flag, int parentLength, int itemSetLength, PtreeNode linkRef, short[] currentItemSet, int topIndex, PtreeNode oldRef) { // Find leading common ellements in row and sibling itemSets if any short[] leadingSubString = checkForLeadingSubString(currentItemSet, linkRef.itemSet); // If leading common ellements exists create new node representing // common elements and add current itemSet as child of this new node. // Otherwise add new itemSet as elder sibling. if (leadingSubString != null) { // Create new "dummy" parent representing leading common elements PtreeNode newDummyParentRef = createPtreeNode(leadingSubString, leadingSubString.length+parentLength-1); // Add support made up of existing support + 1 for current row newDummyParentRef.support = linkRef.support+1; // Insert new parent node into tree addSupport2(flag,newDummyParentRef,topIndex,oldRef); // Remove leading substring from current existing node and add // as child of new parent node linkRef.itemSet = realloc4(linkRef.itemSet,leadingSubString); newDummyParentRef.childRef = linkRef; // Store reference to existing branch of current existing node // in temporary variable PtreeNode tempRef = linkRef.siblingRef; // Removeleading substring from current record currentItemSet = realloc4(currentItemSet,leadingSubString); // Create new node representing row and add as sibling of // current node linkRef.siblingRef = createPtreeNode(currentItemSet,itemSetLength); //Now add in previous siblings newDummyParentRef.siblingRef=tempRef; } // Otherwise carry on along sibling branch else addToPtree(2,parentLength,itemSetLength,linkRef.siblingRef, currentItemSet,0,linkRef); } /* ------ ADD SUPPORT 2 ------ */ /** Adds new node where "before and subset" or "after and not superset".

The flag argument indicates which type of branch is currently under consideration: 0 = root, 1 = child, 2 = sibling. @param flag the type of branch currently under consideration: code 0 = root, 1 = child, 2 = sibling. @param newRef the reference (pointer) to newly created parent indicating current laction in the P-tree. @param topIndex the index of the element in the array marking the top level of the P-tree, used only when inserting new nodes hanging from this top level otherwise ignored. @param oldRef the reference (pointer) to the previous location in the P-tree, used when inserting new nodes. */ private void addSupport2(int flag, PtreeNode newRef, int topIndex, PtreeNode oldRef) { // Add node switch (flag) { case 0: startPtreeRef[topIndex].childRef = newRef; break; case 1: oldRef.childRef = newRef; break; case 2: oldRef.siblingRef = newRef; break; default: System.out.println("ERROR: Unidentified flag in addSupport\n"); } } /* CREAT P-TREE NODE */ /** Creates a P-tree node (other than a top level node). @param newItemSet the itemset to be stored at the node. @param level the cardinality (length) of the item set to be stored in the node. */ private PtreeNode createPtreeNode(short[] itemSet, int level) { pTreeNodesOfCardinalityN[level]++; return(new PtreeNode(itemSet)); } /* GET START OF P-TREE */ /** Gets reference to start of P-tree. */ public PtreeNodeTop[] getStartOfPtree() { return(startPtreeRef); } /* GET NUMBER P-TREE NODES */ /** Gets number of nodes in P-tree. */ public int getNumPtreeNodes() { return(calculateNumNodes(startPtreeRef)); } /*----------------------------------------------------------------------- */ /* */ /* P-TREE TABLE */ /* */ /*----------------------------------------------------------------------- */ /* CREATE P-TREE TABLE */ /** Creates P-tree table starting with top level in P-tree.

Proceed as follows.

  1. Create an array of arrays.
  2. Add top level of Ptree.
  3. Add remaining nodes in sub-branches.
*/ public void createPtreeTable() { // Set up array of arrays for (int index=1;index