/* -------------------------------------------------------------------------- */ /* */ /* WORD BINARY TREE */ /* */ /* Frans Coenen */ /* */ /* Tuesday 20 December 2005 */ /* (Modified Monday 3 July 2006, Tuesday 11 July 2006) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ //package lucsKDD_ARM; /** Contains methods to generate a binary tree to store all the words representesd by a set of documents. Each node in the tree is an instance of the class WordBinTreeNode. The methods in this class also process the word bin tree to identify significant words. @author Frans Coenen @version 20 December 2005 */ public class WordBinTree { /* ------------------------------- */ /* */ /* FIELDS */ /* */ /* ------------------------------- */ /** Maximum number of significant word allowed. */ private int maxNumSigWords = 2500; /** The number of classes. */ private int numClasses = 0; /** The given significance index. */ private double sigIndex = 0.0; /* REFERENCES */ /** Reference to start of bin tree. */ private WordBinTreeNode wordBinTreeStart = null; /* DATA STRUCTURE */ /** 2D array in which to store training set of binary valued ARM data, each attribute representing a keyword. */ private short[][] trainingDataArray = null; /** Array of PotentialSigWord nodes in which to store the list of words from which the final set of significant words will be selected. */ private PotentialSigWord[] potSigWordList = null; /** Documents per class in training set. */ private int[] docsPerClass = null; /* DIAGNOSTIC FIELDS */ /** Total number of words in document base. */ private int totalNwordsInDB = 0; /** Number of pruned words above Upper Noise Threshold (UNT) . */ private int wordsAboveUNT = 0; /** Number of pruned words below Lower Noise Threshold (LNT) . */ private int wordsBelowLNT = 0; /** Number of non-distinguishing words. */ private int ordinaryWords = 0; /** Number of potential signifuicant words (i.e. number of enties in potential significant word list). */ private int numPotSigWords = 0; /** Number of significant words. */ private int sigWords = 0; /** Total number of words in documenmt base per class. */ private int[] totalNwordsInDBperClass = null; /* FLAGS */ /** Significant word contribution calculation strategy. */ private String sigWordContCalcStrat = null; /** Potential significant word list generation strategy. */ private String potSWlistGenStrat = null; /** Significant word selection strategy. */ private String sigWordSelectStrat = null; /* ------------------------------------ */ /* */ /* CONSTRUCTORS */ /* */ /* ------------------------------------ */ /** three argument constructor. @param nClasses the number of classes represented in the documnet base. @param the newMaxValue the desired maximum number of permitted significant words value. @param sigI the given sitgnificance index. @param newSWcontCalcStrat desired significant word contribution calculation strategy. @param newPotSWgenStrat desired potential significant word list generation strategy. @param newSWselectStrat desired significant word selection strategy.*/ public WordBinTree(int nClasses, int newMaxValue, double sigI, String newSWcontCalcStrat, String newPotSWgenStrat, String newSWselectStrat) { numClasses = nClasses; maxNumSigWords = newMaxValue; sigIndex = sigI; sigWordContCalcStrat = newSWcontCalcStrat; potSWlistGenStrat = newPotSWgenStrat; sigWordSelectStrat = newSWselectStrat; // Define total number of words in document base per class array. totalNwordsInDBperClass = new int[nClasses]; for (int index=0;indexWords below or above the noise thresholds are marked as such in the bin tree. Remaining words are considered (for the time being to be ordionary) but the "contribution" to each class is calculated. Bin tree Pruning is only appropriate for certain significant/keyword identification processes. @param lowerNT the lower noise threshold in terms of number of documents. @param upperNT the upper noise threshold in terms of number of documents. @param numRows the number of records in training set. */ public void pruneWordBinTree(int lowerNT, int upperNT, int numRows) { // Outout significance contribution calculation stratgey System.out.println("Significance contribution calculation stratgey = " + sigWordContCalcStrat); // Procced pruneWordBinTree(wordBinTreeStart,lowerNT,upperNT,numRows); } /** Continues process of identifying ("pruning") words below or above support thresholds to leave only potential significant words.

If a word is potentially significant it is first described as an ordinary word, later when the potential significant word list is created and processed these ordinary (potentially significant) words may be redesignated as significant words. @param node the current location in the bin tree. @param lowerNT the lower support threshold. @param upperNT the upper support threshold. @param numRows the number of records in training set. */ private void pruneWordBinTree(WordBinTreeNode node, int lowerNT, int upperNT, int numRows) { if (node != null) { if (node.support<=lowerNT) { wordsBelowLNT++; node.isLowerNoiseWord = true; } else if (node.support>=upperNT) { wordsAboveUNT++; node.isUpperNoiseWord = true; } else { calculateContribution(node,numRows); ordinaryWords++; node.isOrdWord = true; } // Class support array (containing support counts per class) // no longer required so set to null; node.classSupport = null; // Continue pruneWordBinTree(node.beforeBranch,lowerNT,upperNT,numRows); pruneWordBinTree(node.afterBranch,lowerNT,upperNT,numRows); } } /* ----------------------------------------------------------- */ /* */ /* CALCULATE SIGNIFICANCE CONTRIBUTION */ /* */ /* ----------------------------------------------------------- */ /** Commences provess of calculating signifiance contribution by selecting desired calculation strategy.

The support count is the number of documents the word appears in (count of 1 per document), the frequency count is the number of times the word appears in the entire document set (count of n per document). @param node the current location in the bin tree. @param numRows the number of records in training set. */ private void calculateContribution(WordBinTreeNode node, int numRows) { // Select stratgey: (i) word support, (ii) word frequency or (iii) //error. if (sigWordContCalcStrat.equals("Word Support")) calcContribWordSup(node,numRows); else if (sigWordContCalcStrat.equals("Word Frequency")) calcContribWordFreq(node,numRows); else System.out.println("ERROR: Unrecognised significant word " + "contribution selection strategy"); } /* CALCULATE CONTRIBUTION (WORD SUPPORT) */ /** Calculates the signifcance contribution of the word which is later used to determine whether given word is a distinguishing (significant) word or not, i.e. can be used to distinguish between classes or otherwise using the word support calculation strategy.

Proceed as follows:

  1. Calculate local support for word in each class in terms of the number occurances of the word in the class with respect to the number of documents in the class.
  2. Calculate the global support.
  3. Find the contibution (fraction) of each local support value to the global support value.
If contribution does not exceed the desired index (G) this indicates that the word does not serve to differentiate between classes (note that given an appropriate value of G it is possible for a word to be significant with respect to more than one class). If all contributions are 1 then the word is distributed equally over the classess. If contribution with respect to a particular class is greater than 1 then the word has some significance with respect to the class. @param node the current location in the bin tree. @param numRows the number of records in training set. */ private void calcContribWordSup(WordBinTreeNode node, int numRows) { // Calculate (percenatge) local support for word for each class double[] localSups = new double[numClasses]; double total = 0.0; for (int index=0;indexThese are ordinary words that serve to distinguish between classes. A word is not significant if it serves to distinguish between groups of 2 or more classes. Note that contribution value is rased by 10^2 so sigIndix value must also be raised by the same amount. */ public void generatePotentialSigWordList() { System.out.println("Potential Sig. Word. list generation stratgey = " + potSWlistGenStrat); // Include all words with contribution above G in potential significant // word list (likely to include words that are significant for more than // one class). if (potSWlistGenStrat.equals("All Significant Words")) genPotSWlistWithAllSW(); // Include all unique words with contribution above G in potential // significant word list (i.e. words that are potentially significant // for one class only). else if (potSWlistGenStrat.equals("Unique Sig. Words")) genPotSWlistWithUniqueSW(); // Error else System.out.println("ERROR: Unrecognised potential significant " + "word list generation strategy."); } /* GENERATE POTENTIAL SIGNIFICANT WORD LIST WITH ALL SIGNIFICANT WORDS */ /** Commences process of generating list of potential significant words where list contains all available potential significant words, i.e. those with a significance contribution above the given significance threshold (G).

Note that contribution value is rased by 10^2 so sigIndix value must also be raised by the same amount. */ private void genPotSWlistWithAllSW() { // Raise significance index (so that contributions can be output to two // decimal places). int sigI = (int) (sigIndex*100.0); // Get number of potentially significant words and check numPotSigWords = getNumPotSigWords(sigI); if (numPotSigWords==0) return; // Dimension potential significant words list potSigWordList = new PotentialSigWord[numPotSigWords]; // Populate list by processing word bin tree int pswlIndex=0; populatePotSWlist(wordBinTreeStart,pswlIndex,sigI); // Order list sortPotSigWordList(); } /** Recursively populates list of potential significant words by processing word bin tree.

Word is only entered once into the potential significant word list even if it is potentially significant with respect to a number of classes. @param node the current word bin tree node in the recirsion @param pswlIndex the current index in to the potential significant words list. @param sigI the significance index reaised by 100. @return the new index in to the potential significant words list.*/ private int populatePotSWlist(WordBinTreeNode node, int pswlIndex, int sigI) { if (node != null) { // Process before branch pswlIndex = populatePotSWlist(node.beforeBranch,pswlIndex,sigI); // Process node by first finding class with best contribution. If // this is better than significance threshold add to potential // significant word list. double bestContrib = node.contribution[0]; short bestIndex = 0; for (short index=1;indexbestContrib) { bestContrib = node.contribution[index]; bestIndex = index; } } // Add to list? if (bestContrib>sigI) { potSigWordList[pswlIndex] = new PotentialSigWord(node,bestIndex); pswlIndex = pswlIndex+1; } // Process after branch pswlIndex = populatePotSWlist(node.afterBranch,pswlIndex,sigI); } // Return return(pswlIndex); } /* GET NUMBER OF POTENTIAL SIGNIFICANT WORDS. */ /** Gets the number of (potential) significant words.

A word is counted more once only regardless of whether it is potentially significant for more than one class. Note also that the contribution value hasd been raised by 10^2 so sigIndix value must also be raised by the same amount. @param sifIndex the significance index raised by 100. @return the count so far. */ private int getNumPotSigWords(int sigI) { int count=0; // Generate count if (wordBinTreeStart!=null) count = getNumPotSigWords(wordBinTreeStart,sigI,count); // Return return(count); } /** Continues process of counting number of potential significant words.

Contribution value is rased by 10^2 so sigIndix value has also (temporarily) been raised by the same amount. @param node the current location in the bin tree. @param sifIndex the significance index raised by 100. @param countSoFar the current count. @return the count so far. */ private int getNumPotSigWords(WordBinTreeNode node, int sigI, int countSoFar) { // Process node if (node != null) { // Process before branch countSoFar = getNumPotSigWords(node.beforeBranch,sigI,countSoFar); // Process node. for (int index=0;indexsigI) { countSoFar=countSoFar+1; break; } } // Process after branch countSoFar = getNumPotSigWords(node.afterBranch,sigI,countSoFar); } // Return return(countSoFar); } /* GENERATE POTENTIAL SIGNIFICANT WORD LIST WITH UNIQUE SIGNIFICANT WORDS */ /** Commences process of generating list of unique potential significant words.

Thus a candidate word is not considered potentially significant if it serves to distinguish between groups of 2 or more classes. Note also that contribution value is rased by 10^2 so sigIndix value must also be raised by the same amount. */ private void genPotSWlistWithUniqueSW() { // Raise significance index (so that contributions can be output to two // decimal places). int sigI = (int) (sigIndex*100.0); // Get number of potentially significant words and check numPotSigWords = getNumPotUniqueSigWords(sigI); if (numPotSigWords==0) return; // Dimension potential significant words list potSigWordList = new PotentialSigWord[numPotSigWords]; // Populate list by processing word bin tree int pswlIndex=0; populateUniquePotSWlist(wordBinTreeStart,pswlIndex,sigI); // Order list sortPotSigWordList(); } /** Recursively populates list of unique potential significant words by processing word bin tree. @param node the current word bin tree node in the recirsion @param pswlIndex the current index in to the potential significant words list. @param sigI the significance index reaised by 100. @return the new index in to the potential significant words list.*/ private int populateUniquePotSWlist(WordBinTreeNode node, int pswlIndex, int sigI) { if (node != null) { // Process before branch pswlIndex = populateUniquePotSWlist(node.beforeBranch,pswlIndex, sigI); // Process node, count number of classes for which word is // potentially significant and if this is 1 add to the potential // significant word list int tempCounter = 0; short classIndex = 0; for (short index=0;indexsigI) { tempCounter++; classIndex=index; } } if (tempCounter==1) { potSigWordList[pswlIndex] = new PotentialSigWord(node,classIndex); pswlIndex = pswlIndex+1; } // Process after branch pswlIndex = populateUniquePotSWlist(node.afterBranch,pswlIndex,sigI); } // Return return(pswlIndex); } /* GET NUMBER OF POTENTIAL UNIQUE SIGNIFICANT WORDS. */ /** Gets the number of inique potential significant words in the word bin tree.

Note that contribution value is rased by 10^2 so sigIndix value must also be raised by the same amount. @param sifIndex the significance index raised by 100. @return the count so far. */ private int getNumPotUniqueSigWords(int sigI) { int count=0; // Generate count if (wordBinTreeStart!=null) count = getNumPotUniqueSigWords(wordBinTreeStart,sigI,count); // Return return(count); } /** Continues process of counting number of potential significant words.

(Contribution value has been rased by 10^2 so sigIndix value has also (temporarily) been raised by the same amount. @param node the current location in the bin tree. @param sifIndex the significance index raised by 100. @param countSoFar the current count. @return the count so far. */ private int getNumPotUniqueSigWords(WordBinTreeNode node, int sigI, int countSoFar) { // Process node if (node != null) { // Process before branch countSoFar = getNumPotUniqueSigWords(node.beforeBranch,sigI,countSoFar); // Process node, count number of classes for which word is // potentially significant and if this is 1 increment count by 1. int tempCounter = 0; //System.out.print("Word = " + node.word); for (int index=0;indexsigI) tempCounter++; } //System.out.println(); if (tempCounter==1) countSoFar=countSoFar+1; // Process after branch countSoFar = getNumPotUniqueSigWords(node.afterBranch,sigI,countSoFar); } // Return return(countSoFar); } /* -------------------------------------------- */ /* */ /* ID SIGNIFICANT WORDS */ /* */ /* -------------------------------------------- */ /** Commences process of identifying significant word by selecting desired strategy. @return the number of significant words (used in keyword only mode). */ public int idSignificantWords() { // Check if any potential significant words exist if (potSigWordList==null) { System.out.println("WARNING: No potential significant words " + "identified\n"); return(sigWords); } // Output System.out.println("Sig. Word. identification stratgey = " + sigWordSelectStrat + ", N is " + maxNumSigWords + " words."); // Select strategy: (i) First N words, (ii) First N words with even // distribution (ED) across classes or (iii) error. if (sigWordSelectStrat.equals("First N")) idSignificantWordsAll(); else if (sigWordSelectStrat.equals("First N (distributed)")) idSignificantWordsWithED(); else System.out.println("ERROR: Unrecognised significant word " + "selection strategy"); // Return return(sigWords); } /* IDENTIFY FIRST N SIGNIFICANY WORDS WITH EVEN DISTRIBUTION */ /** Identify first N significant words in the potential significant words list.

All potential significant words are iniatially identified as ordinary words, thus for each allocated significant word the ordinary word count must be decremented by one and the significant word count incremented by one. */ private void idSignificantWordsAll() { // Determine end int end = 0; if (potSigWordList.lengthIf there are not sufficient words in a class then any unassigned potential significant words, starting from the top of the list, are identified as significant words. If the number of potential significant words is less than the maximum peritted number all the potential words become significant. All potential significant words are iniatially identified as ordinary words, thus for each allocated significant word the ordinary word count must be decremented by one and the significant word count incremented by one). */ private void idSignificantWordsWithED() { // If less potentail significant words than desired maximum then all // potential significant words are considered to be significant. if (potSigWordList.length<=maxNumSigWords) idSWwithEDall(); else idSignificantWordsSome(); } /** Identifies a subset of the potential significant words list as significant by selecting words so that an even distribution across classes is obtained (if possible).

Proceed as folows: 1. Define an array to hold significant word counts per class (same length as number of classes. 2. Determine max number of sig. words per class. 3. Process potential significant word list an attempt to fill up quota for each class with significant words (note that a significant word allocated to a class may also be significant for another class. */ private void idSignificantWordsSome() { // Define an array to hold significant word counts per class. int[] counterArray = new int[numClasses]; // Determine max number of sig. words per class int classMax = maxNumSigWords/numClasses; // Process potential significant word list for (int index=0;index=maxNumSigWords) break; } // Add top N unsassigned words to make up maximum. int sigWordsStillToGo = maxNumSigWords-sigWords; // Identify remaining significant words. for (int index=0;index Proceeds by processing word bin tree node by node, each word bin tree node includes a list of the documents in which the word represented by the node are contained. Note that documents are numbered from 1 onwards while the data array is indexed starting from zero therefore it is necessary to subtract 1 from the document number to obtian the array index. If word is a key (significant) word add it to trainuing data array by including its attribute number. @param node the current location in the phrase bin tree. @param attNum the current attrivute number. */ private short generateDataArray(WordBinTreeNode node, short attNum) { if (node != null) { // Left Branch attNum = generateDataArray(node.beforeBranch,attNum); // Process node if (node.isSigWord) { addToDataArray(node.docNumbers,attNum); attNum++; } // Right Branch attNum = generateDataArray(node.afterBranch,attNum); } // End return(attNum); } /** Adds keyword attibute to appropriate locations in data array. Note that documents are numbered from 1 onwards while the data array is indexed starting from zero therefore it is necessary to subtract 1 from the document number to obtian the array index. @param docNumbers the list of documents in which the attribute appears. @param attNum the current attrivute number. */ private void addToDataArray(int[] docNumbers, short attNum) { // Process document numbers array for(int docNumIndex=0;docNumIndexProceeds in a recursive manner by processing the word bin tree. @param testDocBase the given instance of the class TestSetDocBase. @param node the current location in the word bin tree. @param docNum the current document number. @param attNum the current attribute number. */ private short generateTestDataArray(TestSetDocumentBase testDocBase, WordBinTreeNode node, int docNum, short attNum) { if (node != null) { // Left Branch attNum = generateTestDataArray(testDocBase,node.beforeBranch, docNum,attNum); // Process word at node (method in TestSetDocumentBase class). if (node.isSigWord) { testDocBase.findInTestSetDoc(node.word,docNum,attNum); attNum++; } // Right Branch attNum = generateTestDataArray(testDocBase,node.afterBranch, docNum,attNum); // Return return(attNum); } else return(attNum); } /* ------------------------------------ */ /* SET METHODS */ /* ------------------------------------ */ /** Commences processes of walking through word bin tree and, for each node, setting the notPrerviouslyInDoc flag to true ready for processing the next training set document. */ public void setDocNumbersToZero() { setDocNumbersToZero(wordBinTreeStart); } /** Continues process of of walking through word bin tree and, for each node, setting the notPrerviouslyInDoc flag to true ready for processing the next training set document. @param node the current location in the bin tree. */ private void setDocNumbersToZero(WordBinTreeNode node) { if (node != null) { // Process before branch setDocNumbersToZero(node.beforeBranch); // Process node if unpruned word node.notPrerviouslyInDoc=true; // Process after branch setDocNumbersToZero(node.afterBranch); } } /** Sets documents per class array to the given values. @param newDocsPerClass the given numbers of documents per class. */ public void setDocsPerClass(int[] newDocsPerClass) { docsPerClass = new int[newDocsPerClass.length]; // process given array for (int index=0;index NOTE: when defining documents as phrases the word bin tree gets deleted (so as to gain menmory) once the phrase bin tree has been created. This output method must therefore be included either: (i) in the loadTrainingSetDocumentBase method in the TextMining class after the word bin tree has been created, or (ii) in the processTrainingSetDocumentBase method in the PhraseMining class before the word bin tree is deleted. @return the number of unpruned nodes. */ public int getNumNodesInWordBinTree() { final int COUNTER=0; return(getNumNodesInWordBinTree(wordBinTreeStart,COUNTER)); } /** Continues process of counting number of nodes in word bin tree. @param node the current location in the bin tree. @param countSoFar the number of unpruned nodes counted so far. @return the number of unpruned nodes. */ public int getNumNodesInWordBinTree(WordBinTreeNode node, int countSofar) { if (node != null) { countSofar = countSofar+1; // Process before branch countSofar = getNumNodesInWordBinTree(node.beforeBranch, countSofar); // Process after branch countSofar = getNumNodesInWordBinTree(node.afterBranch, countSofar); } // Return return(countSofar); } /* --------------------------------------------------------------------- */ /* */ /* POTENTIAL SIGNIFICANBT WORDS BUBBLE SORT */ /* */ /* --------------------------------------------------------------------- */ /** Commences a bubble sort" on the list of potential significant words.

Has adavantage that it maintains alpha ordering. */ private void sortPotSigWordList() { boolean isOrdered; int index; do { isOrdered = true; index = 0; while (index < (potSigWordList.length-1)) { double value1 = getPotSigWordContribValue(index); double value2 = getPotSigWordContribValue(index+1); if (value1First finds the "cIndex" into the contribution array, then gets the contribution array ref, and then the value.

Used by sort routines. @param index the given index into potentia significant word list. @return the contribution. */ private int getPotSigWordContribValue(int index) { // Get "cIndex" into the contribution array int cIndex = (int) potSigWordList[index].classIndex; // Gets the contribution array ref short[] contrib = potSigWordList[index].wordBinTreeNode.contribution; // ReturnValue return(contrib[cIndex]); } /* -------------------------------------------------------------------- */ /* */ /* POTENTIAL SIGNIFICANBT WORDS QUICK SORT */ /* */ /* -------------------------------------------------------------------- */ /** Commences a "quick sort" on the list of potential significant words.

Algorithm taken from Cay Horstmann's Big Java (2nd Edsition), Wiley. */ private void sortPotSigWordListQ() { int startIndex = 0; int endIndex = potSigWordList.length-1; sortPotSigWordListQ(startIndex,endIndex); } /** Performs a "quick sort" on the list of potential significant words. @param fromIndex the start index of the current sub-array in the list. @param toIndex the end index of the current sub-array in the list. */ private void sortPotSigWordListQ(int fromIndex, int toIndex) { if (fromIndexpivot) { localFrom++; fromValue = getPotSigWordContribValue(localFrom); } localTo--; toValue = getPotSigWordContribValue(localTo); while (toValue NOTE: when defining documents as phrases the word bin tree gets deleted (so as to gain menmory) once the phrase bin tree has been created. This output method must therefore be included either: (i) in the loadTrainingSetDocumentBase method in the TextMining class after the word bin tree has been created, or (ii) in the processTrainingSetDocumentBase method in the PhraseMining class before the word bin tree is deleted. */ public void outputWordBinTreeUNW() { System.out.println("WORD BIN TREE"); System.out.println("Upper noise threshold words"); System.out.println("==========================="); if (wordBinTreeStart!=null) { int puncCounter=0; outputWordBinTreeUNW(wordBinTreeStart,puncCounter); System.out.println(); } else System.out.println("EMPTY!\n"); //End System.out.println("\n"); } /** Continues process of outputting upper noise threshold words in word bin tree. @param node the current location in the bin tree. @param puncCounter pinctuation counter. @return punctuation counter sofar. */ private int outputWordBinTreeUNW(WordBinTreeNode node, int puncCounter) { if (node != null) { // Process before branch puncCounter = outputWordBinTreeUNW(node.beforeBranch,puncCounter); // Process node if (node.isUpperNoiseWord) { System.out.print("{" + node.word + " " + node.support); // Classes for which word is significant (Note that // classSupport array have been set to null during pruning of // word Bin tree --- saves space!). if (node.classSupport!=null) { System.out.print(", classes: ["); for (int index=0;index NOTE: when defining documents as phrases the word bin tree gets deleted (so as to gain menmory) once the phrase bin tree has been created. This output method must therefore be included either: (i) in the loadTrainingSetDocumentBase method in the TextMining class after the word bin tree has been created, or (ii) in the processTrainingSetDocumentBase method in the PhraseMining class before the word bin tree is deleted. */ public void outputWordBinTreeLNW() { System.out.println("WORD BIN TREE"); System.out.println("Lower noise threshold words"); System.out.println("==========================="); if (wordBinTreeStart!=null) { int puncCounter=0; outputWordBinTreeLNW(wordBinTreeStart,puncCounter); System.out.println(); } else System.out.println("EMPTY!\n"); //End System.out.println("\n"); } /** Continues process of outputting lower noise threshold words in word bin tree. @param node the current location in the bin tree. @param puncCounter pinctuation counter. @return punctuation counter sofar. */ private int outputWordBinTreeLNW(WordBinTreeNode node, int puncCounter) { if (node != null) { // Process before branch puncCounter = outputWordBinTreeLNW(node.beforeBranch,puncCounter); // Process node if (node.isLowerNoiseWord) { System.out.print("{" + node.word + " " + node.support); // Classes which it is related to. if (node.classSupport!=null) { System.out.print(", classes: ["); for (int index=0;index NOTE: when defining documents as phrases the word bin tree gets deleted (so as to gain menmory) once the phrase bin tree has been created. This output method must therefore be included either: (i) in the loadTrainingSetDocumentBase method in the TextMining class after the word bin tree has been created, or (ii) in the processTrainingSetDocumentBase method in the PhraseMining class before the word bin tree is deleted. */ public void outputWordBinTreeOW() { System.out.println("WORD BIN TREE"); System.out.println("Ordinary words"); System.out.println("=============="); if (wordBinTreeStart!=null) { int puncCounter=0; outputWordBinTreeOW(wordBinTreeStart,puncCounter); System.out.println(); } else System.out.println("EMPTY!\n"); //End System.out.println("\n"); } /** Continues process of outputting ordinary words in word bin tree. @param node the current location in the bin tree. @param puncCounter pinctuation counter. @return punctuation counter sofar. */ private int outputWordBinTreeOW(WordBinTreeNode node, int puncCounter) { if (node != null) { // Process before branch puncCounter = outputWordBinTreeOW(node.beforeBranch,puncCounter); // Process node if (node.isOrdWord) { System.out.print("{" + node.word + " " + node.support); // Classes which it is related to. if (node.classSupport!=null) { System.out.print(", classes: ["); for (int index=0;indexisSigWordfield is set to true, this is not necessarily the same as all those words whose significance contribution exceeds the significance index (G). */ public void outputWordBinTreeSW() { System.out.println("WORD BIN TREE"); System.out.println("Significant words (support)"); System.out.println("==========================="); System.out.println("Format: (N) W S CS Contrib"); System.out.println("\tN = sequential number, W = word literal,\n\tS " + "= support count, CS = list of support counts per class " + "if available,\n\tContrib = list of contributions per " + "class if available\n\t(* indicates class(es) for " + "which word is significant)"); if (wordBinTreeStart!=null) { int counter=1; outputWordBinTreeSW(wordBinTreeStart,counter); System.out.println(); } else System.out.println("EMPTY!\n"); //End System.out.println("\n"); } /** Continues process of outputting significant words in word bin tree. @param node the current location in the bin tree. @param counter the counter for the number of significant words. @return punctuation counter sofar. */ private int outputWordBinTreeSW(WordBinTreeNode node, int counter) { if (node != null) { // Process before branch counter = outputWordBinTreeSW(node.beforeBranch,counter); // Process node if (node.isSigWord) { // Output word and support System.out.print("(" + counter + ") " + node.word + " " + node.support); // Classes supports; if (node.classSupport!=null) outputSupportClassData(node.classSupport); // Classs contributions if (node.contribution!=null) outputContribClassData(node.contribution); System.out.println(); // Increment counter counter++; } // Process after branch counter = outputWordBinTreeSW(node.afterBranch,counter); } // Return return(counter); } /** Outputs supports per classes if available. @param classData the class support data array*/ private void outputSupportClassData(int[] classData) { System.out.print(", Class supportss: ["); // Process class support array for (int index=0;indexsigIndex) System.out.print("*"); } // End System.out.print("]"); } /* OUTPUT WORD BIN TREE */ /** Starts recursive process of outputting word bin tree.

NOTE: when defining documents as phrases the word bin tree gets deleted (so as to gain menmory) once the phrase bin tree has been created. This output method must therefore be included either: (i) in the loadTrainingSetDocumentBase method in the TextMining class after the word bin tree has been created, or (ii) in the processTrainingSetDocumentBase method in the PhraseMining class before the word bin tree is deleted. */ public void outputWordBinTree() { System.out.println("FULL WORD BIN TREE"); System.out.println("=================="); if (wordBinTreeStart!=null) { int puncCounter = 0; outputWordBinTree(wordBinTreeStart,puncCounter); System.out.println(); } else System.out.println("EMPTY!\n"); //End System.out.println("\n"); } /** Continues process of outputting word bin tree with indication of nature of words stored at nodes (lower noise word (L), upper noise word (U), ordinary word (O), significant word (S)). @param node the current location in the bin tree. @param puncCounter the punctuation counter sofar. @return the current punctuation counter. */ private int outputWordBinTree(WordBinTreeNode node, int puncCounter) { if (node != null) { // Process before branch puncCounter = outputWordBinTree(node.beforeBranch,puncCounter); // Process node System.out.print("{" + node.word + " " + node.support + " "); // Label if (node.isUpperNoiseWord) System.out.print("-U "); else if (node.isLowerNoiseWord) System.out.print("-L "); else if (node.isOrdWord) System.out.print("-O "); else System.out.print("-S "); // Classes which it is related to. if (node.classSupport!=null) { System.out.print(", classes: ["); for (int index=0;indexsigIndex) { count++; sigWdsPerClass[index]++; } } // Process duplicates int duplicateCount = count-1; if (duplicateCount > 0) { for (int index=0;indexsigIndex) duplicates[index]++; } } } // Process after branch genSigWdsPerClass(node.afterBranch,sigWdsPerClass,duplicates); } } /* OUTPUT NUMBER OF POTENTIAL SIGNIFICANT WORDS PER CLASS */ /** Gets and outputs number of potential significant words per class. */ public void outputPotSigWdsPerClass() { // If no significant word list return if (potSigWordList==null) { System.out.println("Empty significant words list\n"); return; } // Generate output int[] sigWordsPerClass = new int[numClasses]; for (int index=0;index=topN) break; } System.out.println("\n"); } } /* OUTPUT POTENTIAL SIGNIFICANT WORDS LIST */ /** Outputs the potential significant words list, i.e. words in bin tree whose contribution exceeds the significance index (G) and therefore may be eventually catagorized as significant. */ public void outputPotentialSigWords() { System.out.println("Potential significant words\n"); // If no significant word list return if (potSigWordList==null) { System.out.println("Empty significant words list\n"); return; } // Start output System.out.println("#\t| Pot. Sig. Wd | Val.\t| Class\t|" + "Sig (Y/N)\n------------------------" + "-----------------------------------"); // Process array int total=0; for (int index=0;index