/* -------------------------------------------------------------------------- */ /* */ /* PHRASE BINARY TREE */ /* */ /* Frans Coenen */ /* */ /* Wednesday 21 December 2005 */ /* (Revised 3/2/2006) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ //package lucsKDD_ARM; /** Contains methods to generate a binary tree to describe all the phrases representesd by a set of documents. Generated by processing the training document base using methods in TrainingSetDocumentBase commencing with the method generatePhraseBinTree. Also conatins the methods to recast the input training set in terms of a set of attributes representing phrases and to output the training set. @author Frans Coenen @version 21 December 2005. */ public class PhraseBinTree { /* ------------------------------- */ /* */ /* FIELDS */ /* */ /* ------------------------------- */ /** Reference to start of phrase bin tree. */ private PhraseBinTreeNode phraseBinTreeStart = null; /** 2D array in which to store traing set of binary valued ARM data. */ private short[][] trainingDataArray = null; /* ------------------------------------ */ /* */ /* CONSTRUCTORS */ /* */ /* ------------------------------------ */ // Defulat constructor only /* ------------------------------------ */ /* */ /* METHOS */ /* */ /* ------------------------------------ */ /* ---------------------------------------------------- */ /* METHODS TO GENERATE BIN TREE */ /* ---------------------------------------------------- */ /** Starts recursive process of adding phrase to bin tree. @param phrase to be added. @param docNum the ID number of the current document. @param fstSigWordInPhrase the first significant word in the phrase. @param indexOfFstSigWordInPhrase the index of the first significant word in the phrase. */ public PhraseBinTreeNode addToPhraseBinTree(String[] phrase, int docNum, String fstSigWordInPhrase, int indexOfFstSigWordInPhrase) { // Check for empty tree. if (phraseBinTreeStart==null) { phraseBinTreeStart = new PhraseBinTreeNode(phrase,docNum, fstSigWordInPhrase,indexOfFstSigWordInPhrase); return(phraseBinTreeStart); } else return(addToPhraseBinTree(phraseBinTreeStart,phrase,docNum, fstSigWordInPhrase,indexOfFstSigWordInPhrase)); } /** Continues process of adding phrase to bin tree. @param node the current location in the phrase bin tree. @param phrase to be added. @param docNum the ID number of the current document. @param sigWord the first significant word in the phrase. @param indexSigWord the index of the first significant word in the phrase. */ private PhraseBinTreeNode addToPhraseBinTree(PhraseBinTreeNode currentNode, String[] phrase, int docNum, String sigWord, int indexSigWord) { int selector = comparePhrase(phrase,currentNode.phrase); // Comparison: If selector equals 0 found identical phrase if (selector == 0) { if (docNumberDoesNotExist(currentNode,docNum)) addToDocNumbers(currentNode,docNum); return(currentNode); } // Left hand branch else if (selector < 0) { if (currentNode.beforeBranch==null) { PhraseBinTreeNode newPhraseBinTreeNode = new PhraseBinTreeNode(phrase,docNum, sigWord,indexSigWord); currentNode.beforeBranch = newPhraseBinTreeNode; return(newPhraseBinTreeNode); } else return(addToPhraseBinTree(currentNode.beforeBranch,phrase, docNum,sigWord,indexSigWord)); } // Rught hand branch else { if (currentNode.afterBranch==null) { PhraseBinTreeNode newPhraseBinTreeNode = new PhraseBinTreeNode(phrase,docNum, sigWord,indexSigWord); currentNode.afterBranch = newPhraseBinTreeNode; return(newPhraseBinTreeNode); } else return(addToPhraseBinTree(currentNode.afterBranch,phrase, docNum,sigWord,indexSigWord)); } } /** Compare phrases to determine relative lexicographical location. @param phrase the given phrase. @param newPhrase the phrase representwed by the current node. @param return 0 if the same, -1 if phrase is lexicographically before the node phrase and 1 otherwise. */ public static int comparePhrase(String[] phrase, String[] nodePhrase) { int same = 0, before = -1, after = 1; for (int index=0;index0) return(after); } // If phrases the same length then they are the same if (phrase.length==nodePhrase.length) return(same); // Otherwise noderPhrase same as start of phrase but node phrase is // longer. return(before); } /** public void setAdds a document number to the document list for the current word. @param node the current node in the phrase bin tree. @param docNum the ID number of the current document. */ private void addToDocNumbers(PhraseBinTreeNode node, int newDocNum) { // Declare new document numbers array int length = node.docNumbers.length+1; int[] newDocNums = new int[length]; // Copy existing document numbers int index=0; for (;index Proceeds by processing phrase bin tree node by node, each phrase bin tree node includes a list of the documents in which the phrase represented by the node are contained. Note that documents are numbered from 1 onwards while the data array is indexed starting from zero therefore it is necessary to subtract 1 from the document number to obtian the array index. @param node the current location in the phrase bin tree. @param attNum the current attrivute number. */ private short generateDataArray(PhraseBinTreeNode node, short attNum) { if (node != null) { // Left Branch attNum = generateDataArray(node.beforeBranch,attNum); // Process node addToDataArray(node.docNumbers,attNum); // Right Branch attNum = generateDataArray(node.afterBranch,(short) (attNum+1)); } // End return(attNum); } /** Adds phrase attibute to appropriate locations in data array. Note that documents are numbered from 1 onwards while the data array is indexed starting from zero therefore it is necessary to subtract 1 from the document number to obtian the array index. @param docNumbers the list of documents in which the attribute appears. @param attNum the current attrivute number. */ private void addToDataArray(int[] docNumbers, short attNum) { // Process document numbers array for(int docNumIndex=0;docNumIndex Proceeds in a recursive manner by processing the phrase bin tree. @param testDocBase the given instance of the class TestSetDocBase. @param node the current location in the phrase bin tree. @param docNum the current document number. @param attNum the current attribute number. */ private short generateTestDataArray(TestSetDocumentBase testDocBase, PhraseBinTreeNode node, int docNum, short attNum) { if (node != null) { // Left Branch attNum = generateTestDataArray(testDocBase,node.beforeBranch, docNum,attNum); // Process phrase at node (method in TestSetDocumentBase class). testDocBase.findInTestSetDoc(node.phrase,node.sigWord, node.indexSigWord,docNum,attNum); // Right Branch attNum = generateTestDataArray(testDocBase,node.afterBranch, docNum,(short) (attNum+1)); // Return return(attNum); } else return(attNum); } /* ---------------------------------------------------------- */ /* */ /* GET METHODS */ /* */ /* ---------------------------------------------------------- */ /* NUMBER OF NODES IN PHRASE BIN TREE */ /** Commences process of counting number of nodes in phrase bin tree. @return the number of unpruned nodes. */ public int getNumNodesInPhraseBinTree() { final int COUNTER=0; return(getNumNodesInPhraseBinTree(phraseBinTreeStart,COUNTER)); } /** Continues process of counting number of nodes in phrase bin tree. @param node the current location in the bin tree. @param countSoFar the number of unpruned nodes counted so far. @return the number of unpruned nodes. */ public int getNumNodesInPhraseBinTree(PhraseBinTreeNode node, int countSofar) { if (node != null) { countSofar = countSofar+1; // Process before branch countSofar = getNumNodesInPhraseBinTree(node.beforeBranch, countSofar); // Process after branch countSofar = getNumNodesInPhraseBinTree(node.afterBranch, countSofar); } // Return return(countSofar); } /* NUMBER OF PROPPER PHRASES IN BIN TREE */ /** Commences process of counting number of proper phrases in the phrase bin tree. @return the number of proper phrase. */ private int getNumProperPhrasesInBinTree() { final int COUNTER=0; return(getNumProperPhrasesInBinTree(phraseBinTreeStart,COUNTER)); } /** Continues process of counting number of proper phrases in the phrase bin tree. @param node the current location in the bin tree. @param countSoFar the number of unpruned nodes counted so far. @return the number of unpruned nodes. */ public int getNumProperPhrasesInBinTree(PhraseBinTreeNode node, int countSofar) { if (node != null) { if (node.phrase.length>1) countSofar = countSofar+1; // Process before branch countSofar = getNumProperPhrasesInBinTree(node.beforeBranch, countSofar); // Process after branch countSofar = getNumProperPhrasesInBinTree(node.afterBranch, countSofar); } // Return return(countSofar); } /* ------------------------------------------------------------------- */ /* */ /* BOOLEAN TEST METHODS */ /* */ /* ------------------------------------------------------------------- */ /* NONE */ /* ---------------------------------------------------------------- */ /* */ /* DIAGNOSTIC OUTPUT */ /* */ /* ---------------------------------------------------------------- */ /* OUTPUTS TRAINING DATA ARRAY STATISTICS */ /** Output trainimng set statistics: average number of attributes per record and number of empty records (except class attribute); and average number of attributes per record per class and number of empty records per class. @param numOneItemSets the number of one item sets in the data set. @param numClasses the number of classes represented in the data set. */ /*public void outputTrainingSetStats(int numOneItemSets, int numClasses) { System.out.println("TRAINING SET STATS:\n==================="); int offset = numClasses-numOneItemSets-1; int numAtts = 0; int numDocs = 0; int numMTrecs = 0; // Initilise array to hold class data int[] numAttsPerClass = new int[numClasses]; int[] numDocsPerClass = new int[numClasses]; int[] numMTrecsPerClass = new int[numClasses]; // Process training data array for (int docIndex=0;docIndex data array is a 2-D array, first index represenmts the record/document number and the second the phrase index. Each cell cintains an attribute number describing a phrase. Used for local testing only. */ /*private void outputDataArrayAttributes() { System.out.println("TRAINING ATTRIBUTE SET (ATTRIBUTES)"); System.out.println("==================================="); // Process training data array for (int docIndex=0;docIndex data array is a 2-D array, first index represenmts the record/document number and the second the phrase index. Each cell cintains an attribute number describing a phrase. @param phraseMiningObj the current instance of the class PhraseMining (required to get the class label). */ /* public void outputDataArrayPhrases(PhraseMining phraseMiningObj) { System.out.println("TRAINING ATTRIBUTE SET (PHRASES)"); System.out.println("================================"); for (int docIndex=0;docIndexnPhrases) return(phraseNum); // Process node if (node != null) { // Process before branch phraseNum = outputPhrasesN(node.beforeBranch,phraseNum,nPhrases); if (phraseNum>nPhrases) return(phraseNum); // Output phrase at node System.out.println("{[" + phraseNum + "] " + node); phraseNum++; if (phraseNum>nPhrases) return(phraseNum); // Process after branch phraseNum = outputPhrasesN(node.afterBranch,phraseNum,nPhrases); } // End return(phraseNum); } /* OUTPUT PHRASE BIN TREE. */ /** Starts recursive process of outputting phrase bin tree */ public void outputPhraseBinTree() { System.out.println("PHRASE BIN TREE"); System.out.println("==============="); int attributeNumber = 1; outputPhraseBinTree(phraseBinTreeStart,"0",attributeNumber); System.out.println("\n"); } /** Continues process of outputting bin tree (without pruned nodes). @param node the current location in the bin tree. @param num the node number (identifier). @param attNum the attribute number. @return the attribute number sofar. */ private int outputPhraseBinTree(PhraseBinTreeNode node, String num, int attNum) { if (node != null) { // Process before branch attNum = outputPhraseBinTree(node.beforeBranch,num+".0",attNum); // Output phrase at node System.out.println("{[" + num + "][" + attNum + "] " + node + "}"); attNum++; // Process after branch attNum = outputPhraseBinTree(node.afterBranch,num+".1",attNum); } // End return(attNum); } /** Output phrase (used for diagnostic purposes only). @param phrase the given phrase. */ private void outputPhrase(String[] phrase) { for (int index=0;index