/* -------------------------------------------------------------------------- */
/* */
/* PHRASE BINARY TREE */
/* */
/* Frans Coenen */
/* */
/* Wednesday 21 December 2005 */
/* (Revised 3/2/2006) */
/* */
/* Department of Computer Science */
/* The University of Liverpool */
/* */
/* -------------------------------------------------------------------------- */
//package lucsKDD_ARM;
/** Contains methods to generate a binary tree to describe all the phrases
representesd by a set of documents. Generated by processing the training
document base using methods in TrainingSetDocumentBase commencing with
the method generatePhraseBinTree. Also conatins the methods to recast
the input training set in terms of a set of attributes representing phrases
and to output the training set.
@author Frans Coenen
@version 21 December 2005. */
public class PhraseBinTree {
/* ------------------------------- */
/* */
/* FIELDS */
/* */
/* ------------------------------- */
/** Reference to start of phrase bin tree. */
private PhraseBinTreeNode phraseBinTreeStart = null;
/** 2D array in which to store traing set of binary valued ARM data. */
private short[][] trainingDataArray = null;
/* ------------------------------------ */
/* */
/* CONSTRUCTORS */
/* */
/* ------------------------------------ */
// Defulat constructor only
/* ------------------------------------ */
/* */
/* METHOS */
/* */
/* ------------------------------------ */
/* ---------------------------------------------------- */
/* METHODS TO GENERATE BIN TREE */
/* ---------------------------------------------------- */
/** Starts recursive process of adding phrase to bin tree.
@param phrase to be added.
@param docNum the ID number of the current document.
@param fstSigWordInPhrase the first significant word in the phrase.
@param indexOfFstSigWordInPhrase the index of the first significant word in
the phrase. */
public PhraseBinTreeNode addToPhraseBinTree(String[] phrase, int docNum,
String fstSigWordInPhrase, int indexOfFstSigWordInPhrase) {
// Check for empty tree.
if (phraseBinTreeStart==null) {
phraseBinTreeStart = new PhraseBinTreeNode(phrase,docNum,
fstSigWordInPhrase,indexOfFstSigWordInPhrase);
return(phraseBinTreeStart);
}
else return(addToPhraseBinTree(phraseBinTreeStart,phrase,docNum,
fstSigWordInPhrase,indexOfFstSigWordInPhrase));
}
/** Continues process of adding phrase to bin tree.
@param node the current location in the phrase bin tree.
@param phrase to be added.
@param docNum the ID number of the current document.
@param sigWord the first significant word in the phrase.
@param indexSigWord the index of the first significant word in the
phrase. */
private PhraseBinTreeNode addToPhraseBinTree(PhraseBinTreeNode currentNode,
String[] phrase, int docNum, String sigWord,
int indexSigWord) {
int selector = comparePhrase(phrase,currentNode.phrase);
// Comparison: If selector equals 0 found identical phrase
if (selector == 0) {
if (docNumberDoesNotExist(currentNode,docNum))
addToDocNumbers(currentNode,docNum);
return(currentNode);
}
// Left hand branch
else if (selector < 0) {
if (currentNode.beforeBranch==null) {
PhraseBinTreeNode newPhraseBinTreeNode =
new PhraseBinTreeNode(phrase,docNum,
sigWord,indexSigWord);
currentNode.beforeBranch = newPhraseBinTreeNode;
return(newPhraseBinTreeNode);
}
else return(addToPhraseBinTree(currentNode.beforeBranch,phrase,
docNum,sigWord,indexSigWord));
}
// Rught hand branch
else {
if (currentNode.afterBranch==null) {
PhraseBinTreeNode newPhraseBinTreeNode =
new PhraseBinTreeNode(phrase,docNum,
sigWord,indexSigWord);
currentNode.afterBranch = newPhraseBinTreeNode;
return(newPhraseBinTreeNode);
}
else return(addToPhraseBinTree(currentNode.afterBranch,phrase,
docNum,sigWord,indexSigWord));
}
}
/** Compare phrases to determine relative lexicographical location.
@param phrase the given phrase.
@param newPhrase the phrase representwed by the current node.
@param return 0 if the same, -1 if phrase is lexicographically before the
node phrase and 1 otherwise. */
public static int comparePhrase(String[] phrase, String[] nodePhrase) {
int same = 0, before = -1, after = 1;
for (int index=0;index0) return(after);
}
// If phrases the same length then they are the same
if (phrase.length==nodePhrase.length) return(same);
// Otherwise noderPhrase same as start of phrase but node phrase is
// longer.
return(before);
}
/** public void setAdds a document number to the document list for the
current word.
@param node the current node in the phrase bin tree.
@param docNum the ID number of the current document. */
private void addToDocNumbers(PhraseBinTreeNode node, int newDocNum) {
// Declare new document numbers array
int length = node.docNumbers.length+1;
int[] newDocNums = new int[length];
// Copy existing document numbers
int index=0;
for (;index Proceeds by processing phrase
bin tree node by node, each phrase bin tree node includes a list of the
documents in which the phrase represented by the node are contained. Note
that documents are numbered from 1 onwards while the data array is indexed
starting from zero therefore it is necessary to subtract 1 from the
document number to obtian the array index.
@param node the current location in the phrase bin tree.
@param attNum the current attrivute number. */
private short generateDataArray(PhraseBinTreeNode node, short attNum) {
if (node != null) {
// Left Branch
attNum = generateDataArray(node.beforeBranch,attNum);
// Process node
addToDataArray(node.docNumbers,attNum);
// Right Branch
attNum = generateDataArray(node.afterBranch,(short) (attNum+1));
}
// End
return(attNum);
}
/** Adds phrase attibute to appropriate locations in data array. Note
that documents are numbered from 1 onwards while the data array is indexed
starting from zero therefore it is necessary to subtract 1 from the
document number to obtian the array index.
@param docNumbers the list of documents in which the attribute appears.
@param attNum the current attrivute number. */
private void addToDataArray(int[] docNumbers, short attNum) {
// Process document numbers array
for(int docNumIndex=0;docNumIndex
Proceeds in a recursive manner by processing the phrase bin tree.
@param testDocBase the given instance of the class TestSetDocBase.
@param node the current location in the phrase bin tree.
@param docNum the current document number.
@param attNum the current attribute number. */
private short generateTestDataArray(TestSetDocumentBase testDocBase,
PhraseBinTreeNode node, int docNum, short attNum) {
if (node != null) {
// Left Branch
attNum = generateTestDataArray(testDocBase,node.beforeBranch,
docNum,attNum);
// Process phrase at node (method in TestSetDocumentBase class).
testDocBase.findInTestSetDoc(node.phrase,node.sigWord,
node.indexSigWord,docNum,attNum);
// Right Branch
attNum = generateTestDataArray(testDocBase,node.afterBranch,
docNum,(short) (attNum+1));
// Return
return(attNum);
}
else return(attNum);
}
/* ---------------------------------------------------------- */
/* */
/* GET METHODS */
/* */
/* ---------------------------------------------------------- */
/* NUMBER OF NODES IN PHRASE BIN TREE */
/** Commences process of counting number of nodes in phrase bin tree.
@return the number of unpruned nodes. */
public int getNumNodesInPhraseBinTree() {
final int COUNTER=0;
return(getNumNodesInPhraseBinTree(phraseBinTreeStart,COUNTER));
}
/** Continues process of counting number of nodes in phrase bin tree.
@param node the current location in the bin tree.
@param countSoFar the number of unpruned nodes counted so far.
@return the number of unpruned nodes. */
public int getNumNodesInPhraseBinTree(PhraseBinTreeNode node,
int countSofar) {
if (node != null) {
countSofar = countSofar+1;
// Process before branch
countSofar = getNumNodesInPhraseBinTree(node.beforeBranch,
countSofar);
// Process after branch
countSofar = getNumNodesInPhraseBinTree(node.afterBranch,
countSofar);
}
// Return
return(countSofar);
}
/* NUMBER OF PROPPER PHRASES IN BIN TREE */
/** Commences process of counting number of proper phrases in the
phrase bin tree.
@return the number of proper phrase. */
private int getNumProperPhrasesInBinTree() {
final int COUNTER=0;
return(getNumProperPhrasesInBinTree(phraseBinTreeStart,COUNTER));
}
/** Continues process of counting number of proper phrases in the
phrase bin tree.
@param node the current location in the bin tree.
@param countSoFar the number of unpruned nodes counted so far.
@return the number of unpruned nodes. */
public int getNumProperPhrasesInBinTree(PhraseBinTreeNode node,
int countSofar) {
if (node != null) {
if (node.phrase.length>1) countSofar = countSofar+1;
// Process before branch
countSofar = getNumProperPhrasesInBinTree(node.beforeBranch,
countSofar);
// Process after branch
countSofar = getNumProperPhrasesInBinTree(node.afterBranch,
countSofar);
}
// Return
return(countSofar);
}
/* ------------------------------------------------------------------- */
/* */
/* BOOLEAN TEST METHODS */
/* */
/* ------------------------------------------------------------------- */
/* NONE */
/* ---------------------------------------------------------------- */
/* */
/* DIAGNOSTIC OUTPUT */
/* */
/* ---------------------------------------------------------------- */
/* OUTPUTS TRAINING DATA ARRAY STATISTICS */
/** Output trainimng set statistics: average number of attributes per
record and number of empty records (except class attribute); and average
number of attributes per record per class and number of empty records per
class.
@param numOneItemSets the number of one item sets in the data set.
@param numClasses the number of classes represented in the data set. */
/*public void outputTrainingSetStats(int numOneItemSets, int numClasses) {
System.out.println("TRAINING SET STATS:\n===================");
int offset = numClasses-numOneItemSets-1;
int numAtts = 0;
int numDocs = 0;
int numMTrecs = 0;
// Initilise array to hold class data
int[] numAttsPerClass = new int[numClasses];
int[] numDocsPerClass = new int[numClasses];
int[] numMTrecsPerClass = new int[numClasses];
// Process training data array
for (int docIndex=0;docIndex data
array is a 2-D array, first index represenmts the record/document number
and the second the phrase index. Each cell cintains an attribute number
describing a phrase. Used for local testing only. */
/*private void outputDataArrayAttributes() {
System.out.println("TRAINING ATTRIBUTE SET (ATTRIBUTES)");
System.out.println("===================================");
// Process training data array
for (int docIndex=0;docIndex data array is a
2-D array, first index represenmts the record/document number and the
second the phrase index. Each cell cintains an attribute number describing
a phrase.
@param phraseMiningObj the current instance of the class
PhraseMining (required to get the class label). */
/* public void outputDataArrayPhrases(PhraseMining phraseMiningObj) {
System.out.println("TRAINING ATTRIBUTE SET (PHRASES)");
System.out.println("================================");
for (int docIndex=0;docIndexnPhrases) return(phraseNum);
// Process node
if (node != null) {
// Process before branch
phraseNum = outputPhrasesN(node.beforeBranch,phraseNum,nPhrases);
if (phraseNum>nPhrases) return(phraseNum);
// Output phrase at node
System.out.println("{[" + phraseNum + "] " + node);
phraseNum++;
if (phraseNum>nPhrases) return(phraseNum);
// Process after branch
phraseNum = outputPhrasesN(node.afterBranch,phraseNum,nPhrases);
}
// End
return(phraseNum);
}
/* OUTPUT PHRASE BIN TREE. */
/** Starts recursive process of outputting phrase bin tree */
public void outputPhraseBinTree() {
System.out.println("PHRASE BIN TREE");
System.out.println("===============");
int attributeNumber = 1;
outputPhraseBinTree(phraseBinTreeStart,"0",attributeNumber);
System.out.println("\n");
}
/** Continues process of outputting bin tree (without pruned nodes).
@param node the current location in the bin tree.
@param num the node number (identifier).
@param attNum the attribute number.
@return the attribute number sofar. */
private int outputPhraseBinTree(PhraseBinTreeNode node, String num,
int attNum) {
if (node != null) {
// Process before branch
attNum = outputPhraseBinTree(node.beforeBranch,num+".0",attNum);
// Output phrase at node
System.out.println("{[" + num + "][" + attNum + "] " + node + "}");
attNum++;
// Process after branch
attNum = outputPhraseBinTree(node.afterBranch,num+".1",attNum);
}
// End
return(attNum);
}
/** Output phrase (used for diagnostic purposes only).
@param phrase the given phrase. */
private void outputPhrase(String[] phrase) {
for (int index=0;index