If a word
is potentially significant it is first described as an ordinary word, later
when the potential significant word list is created and processed these
ordinary (potentially significant) words may be redesignated as significant
words.
@param node the current location in the bin tree.
@param lowerNT the lower support threshold.
@param upperNT the upper support threshold.
@param numRows the number of records in training set. */
private void pruneWordBinTree(WordBinTreeNode node, int lowerNT,
        int upperNT, int numRows) {
    // An empty branch terminates the recursion.
    if (node == null) return;

    // Classify the word at this node by its support count. The lower
    // threshold is tested first, so it takes precedence when both tests
    // would succeed.
    if (node.support<=lowerNT) {
        node.isLowerNoiseWord = true;
        wordsBelowLNT++;
    }
    else if (node.support>=upperNT) {
        node.isUpperNoiseWord = true;
        wordsAboveUNT++;
    }
    else {
        // Neither kind of noise word: compute its contribution and mark
        // it as ordinary. NOTE(review): calculateContribution presumably
        // reads node.classSupport, so it must stay ahead of the reset
        // below -- confirm.
        calculateContribution(node,numRows);
        node.isOrdWord = true;
        ordinaryWords++;
    }

    // Per-class support counts are not needed after classification, so
    // release the array.
    node.classSupport = null;

    // Descend into both subtrees, "before" branch first.
    pruneWordBinTree(node.beforeBranch,lowerNT,upperNT,numRows);
    pruneWordBinTree(node.afterBranch,lowerNT,upperNT,numRows);
}
/* ----------------------------------------------------------- */
/* */
/* CALCULATE SIGNIFICANCE CONTRIBUTION */
/* */
/* ----------------------------------------------------------- */
/** Starts the calculation of a word's significance contribution by
dispatching to the routine that implements the configured calculation
strategy (held in sigWordContCalcStrat). The support count is the
number of documents the word appears in (count of 1 per document); the
frequency count is the number of times the word appears in the entire
document set (count of n per document). An unrecognised strategy name is
reported on standard output and no contribution is calculated.
@param node the current location in the bin tree.
@param numRows the number of records in training set. */
private void calculateContribution(WordBinTreeNode node, int numRows) {
    // Strategy (i): contribution based on word support.
    if (sigWordContCalcStrat.equals("Word Support")) {
        calcContribWordSup(node,numRows);
        return;
    }
    // Strategy (ii): contribution based on word frequency.
    if (sigWordContCalcStrat.equals("Word Frequency")) {
        calcContribWordFreq(node,numRows);
        return;
    }
    // (iii) Anything else is an error.
    System.out.println("ERROR: Unrecognised significant word " +
            "contribution selection strategy");
}
/* CALCULATE CONTRIBUTION (WORD SUPPORT) */
/** Calculates the significance contribution of the word which is later
used to determine whether given word is a distinguishing (significant)
word or not, i.e. can be used to distinguish between classes or otherwise
using the word support calculation strategy.
These are ordinary words that serve to distinguish between classes. A
word is not significant if it serves to distinguish between groups of 2 or
more classes. Note that contribution value is raised by 10^2 so sigIndex
value must also be raised by the same amount. */
public void generatePotentialSigWordList() {
    // Report which list generation strategy is configured.
    System.out.println("Potential Sig. Word. list generation stratgey = " +
            potSWlistGenStrat);

    // Strategy 1: every word whose contribution is above G goes into the
    // potential significant word list (likely to include words that are
    // significant for more than one class).
    if (potSWlistGenStrat.equals("All Significant Words")) {
        genPotSWlistWithAllSW();
        return;
    }

    // Strategy 2: only words whose contribution is above G for a single
    // class go into the potential significant word list.
    if (potSWlistGenStrat.equals("Unique Sig. Words")) {
        genPotSWlistWithUniqueSW();
        return;
    }

    // Unknown strategy name: report it; no list is generated.
    System.out.println("ERROR: Unrecognised potential significant " +
            "word list generation strategy.");
}
/* GENERATE POTENTIAL SIGNIFICANT WORD LIST WITH ALL SIGNIFICANT WORDS */
/** Builds the list of potential significant words so that it contains
every available candidate, i.e. every word with a significance contribution
above the significance threshold (G). Contribution values are stored raised
by 10^2, so the significance index is scaled by the same factor before any
comparison is made. When no candidate exists the existing list reference is
left as it was. */
private void genPotSWlistWithAllSW() {
    // Scale the significance index to match the stored contributions.
    final int scaledThreshold = (int) (sigIndex*100.0);

    // Count the candidates first so the list can be dimensioned exactly.
    numPotSigWords = getNumPotSigWords(scaledThreshold);
    if (numPotSigWords != 0) {
        potSigWordList = new PotentialSigWord[numPotSigWords];
        // Fill the list from the word bin tree, starting at slot 0, then
        // put it in order.
        populatePotSWlist(wordBinTreeStart,0,scaledThreshold);
        sortPotSigWordList();
    }
}
/** Recursively populates list of potential significant words by processing
word bin tree.
Word is only entered once into the potential significant
word list even if it is potentially significant with respect to a number of
classes.
@param node the current word bin tree node in the recirsion
@param pswlIndex the current index in to the potential significant words
list.
@param sigI the significance index reaised by 100.
@return the new index in to the potential significant words list.*/
private int populatePotSWlist(WordBinTreeNode node, int pswlIndex,
int sigI) {
if (node != null) {
// Process before branch
pswlIndex = populatePotSWlist(node.beforeBranch,pswlIndex,sigI);
// Process node by first finding class with best contribution. If
// this is better than significance threshold add to potential
// significant word list.
double bestContrib = node.contribution[0];
short bestIndex = 0;
for (short index=1;indexbestContrib) {
bestContrib = node.contribution[index];
bestIndex = index;
}
}
// Add to list?
if (bestContrib>sigI) {
potSigWordList[pswlIndex] =
new PotentialSigWord(node,bestIndex);
pswlIndex = pswlIndex+1;
}
// Process after branch
pswlIndex = populatePotSWlist(node.afterBranch,pswlIndex,sigI);
}
// Return
return(pswlIndex);
}
/* GET NUMBER OF POTENTIAL SIGNIFICANT WORDS. */
/** Gets the number of potential significant words in the word bin tree. A
word is counted once only, regardless of whether it is potentially
significant for more than one class. Contribution values have been raised by
10^2, so the significance index passed in must have been raised by the same
amount.
@param sigI the significance index raised by 100.
@return the number of potential significant words (0 for an empty tree). */
private int getNumPotSigWords(int sigI) {
    // Nothing to count when the tree has no root.
    if (wordBinTreeStart == null) return 0;
    // Otherwise start the recursive count at the root with a zero total.
    return getNumPotSigWords(wordBinTreeStart,sigI,0);
}
/** Continues process of counting number of potential significant words.
Contribution value is rased by 10^2 so sigIndix value has also
(temporarily) been raised by the same amount.
@param node the current location in the bin tree.
@param sifIndex the significance index raised by 100.
@param countSoFar the current count.
@return the count so far. */
private int getNumPotSigWords(WordBinTreeNode node, int sigI,
int countSoFar) {
// Process node
if (node != null) {
// Process before branch
countSoFar = getNumPotSigWords(node.beforeBranch,sigI,countSoFar);
// Process node.
for (int index=0;indexsigI) {
countSoFar=countSoFar+1;
break;
}
}
// Process after branch
countSoFar = getNumPotSigWords(node.afterBranch,sigI,countSoFar);
}
// Return
return(countSoFar);
}
/* GENERATE POTENTIAL SIGNIFICANT WORD LIST WITH UNIQUE SIGNIFICANT WORDS */
/** Builds the list of unique potential significant words. A candidate word
is not considered potentially significant if it serves to distinguish
between groups of 2 or more classes. Contribution values are stored raised
by 10^2, so the significance index is scaled by the same factor before any
comparison is made. When no candidate exists the existing list reference is
left as it was. */
private void genPotSWlistWithUniqueSW() {
    // Scale the significance index to match the stored contributions.
    final int scaledThreshold = (int) (sigIndex*100.0);

    // Count the unique candidates so the list can be dimensioned exactly.
    numPotSigWords = getNumPotUniqueSigWords(scaledThreshold);
    if (numPotSigWords != 0) {
        potSigWordList = new PotentialSigWord[numPotSigWords];
        // Fill the list from the word bin tree, starting at slot 0, then
        // put it in order.
        populateUniquePotSWlist(wordBinTreeStart,0,scaledThreshold);
        sortPotSigWordList();
    }
}
/** Recursively populates list of unique potential significant words by
processing word bin tree.
@param node the current word bin tree node in the recirsion
@param pswlIndex the current index in to the potential significant words
list.
@param sigI the significance index reaised by 100.
@return the new index in to the potential significant words list.*/
private int populateUniquePotSWlist(WordBinTreeNode node, int pswlIndex,
int sigI) {
if (node != null) {
// Process before branch
pswlIndex = populateUniquePotSWlist(node.beforeBranch,pswlIndex,
sigI);
// Process node, count number of classes for which word is
// potentially significant and if this is 1 add to the potential
// significant word list
int tempCounter = 0;
short classIndex = 0;
for (short index=0;indexsigI) {
tempCounter++;
classIndex=index;
}
}
if (tempCounter==1) {
potSigWordList[pswlIndex] =
new PotentialSigWord(node,classIndex);
pswlIndex = pswlIndex+1;
}
// Process after branch
pswlIndex = populateUniquePotSWlist(node.afterBranch,pswlIndex,sigI);
}
// Return
return(pswlIndex);
}
/* GET NUMBER OF POTENTIAL UNIQUE SIGNIFICANT WORDS. */
/** Gets the number of unique potential significant words in the word bin
tree. Contribution values are raised by 10^2, so the significance index
passed in must have been raised by the same amount.
@param sigI the significance index raised by 100.
@return the number of unique potential significant words (0 for an empty
tree). */
private int getNumPotUniqueSigWords(int sigI) {
    // Nothing to count when the tree has no root.
    if (wordBinTreeStart == null) return 0;
    // Otherwise start the recursive count at the root with a zero total.
    return getNumPotUniqueSigWords(wordBinTreeStart,sigI,0);
}
/** Continues process of counting number of potential significant words.
(Contribution value has been rased by 10^2 so sigIndix value has also
(temporarily) been raised by the same amount.
@param node the current location in the bin tree.
@param sifIndex the significance index raised by 100.
@param countSoFar the current count.
@return the count so far. */
private int getNumPotUniqueSigWords(WordBinTreeNode node, int sigI,
int countSoFar) {
// Process node
if (node != null) {
// Process before branch
countSoFar = getNumPotUniqueSigWords(node.beforeBranch,sigI,countSoFar);
// Process node, count number of classes for which word is
// potentially significant and if this is 1 increment count by 1.
int tempCounter = 0;
//System.out.print("Word = " + node.word);
for (int index=0;indexsigI) tempCounter++;
}
//System.out.println();
if (tempCounter==1) countSoFar=countSoFar+1;
// Process after branch
countSoFar = getNumPotUniqueSigWords(node.afterBranch,sigI,countSoFar);
}
// Return
return(countSoFar);
}
/* -------------------------------------------- */
/* */
/* ID SIGNIFICANT WORDS */
/* */
/* -------------------------------------------- */
/** Starts identification of significant words by dispatching to the routine
for the configured selection strategy (held in sigWordSelectStrat). If
no potential significant word list exists a warning is printed and no
selection is attempted; an unrecognised strategy name is reported on
standard output.
@return the current significant word count (used in keyword only mode). */
public int idSignificantWords() {
    if (potSigWordList != null) {
        // Report the strategy and the limit N in use.
        System.out.println("Sig. Word. identification stratgey = " +
                sigWordSelectStrat + ", N is " + maxNumSigWords + " words.");
        // (i) First N words, (ii) first N words with even distribution
        // (ED) across classes, or (iii) error.
        if (sigWordSelectStrat.equals("First N")) {
            idSignificantWordsAll();
        }
        else if (sigWordSelectStrat.equals("First N (distributed)")) {
            idSignificantWordsWithED();
        }
        else {
            System.out.println("ERROR: Unrecognised significant word " +
                    "selection strategy");
        }
    }
    else {
        // No candidates were ever generated, so there is nothing to select.
        System.out.println("WARNING: No potential significant words " +
                "identified\n");
    }
    return sigWords;
}
/* IDENTIFY FIRST N SIGNIFICANT WORDS WITH EVEN DISTRIBUTION */
/** Identify first N significant words in the potential significant words
list. All potential significant words are iniatially identified as
ordinary words, thus for each allocated significant word the ordinary word
count must be decremented by one and the significant word count incremented
by one. */
private void idSignificantWordsAll() {
// Determine end
int end = 0;
if (potSigWordList.lengthIf there are not sufficient words in a class then any
unassigned potential significant words, starting from the top of the list,
are identified as significant words. If the number of potential significant
words is less than the maximum peritted number all the potential words
become significant. All potential significant words are iniatially
identified as ordinary words, thus for each allocated significant word the
ordinary word count must be decremented by one and the significant word
count incremented by one). */
private void idSignificantWordsWithED() {
    // When the candidate list is no longer than the permitted maximum,
    // every candidate becomes significant; otherwise a subset has to be
    // chosen.
    final boolean allCandidatesFit = potSigWordList.length<=maxNumSigWords;
    if (allCandidatesFit) {
        idSWwithEDall();
    }
    else {
        idSignificantWordsSome();
    }
}
/** Identifies a subset of the potential significant words list as
significant by selecting words so that an even distribution across classes
is obtained (if possible). Proceed as follows:
1. Define an array to hold significant word counts per class (same length as
number of classes).
2. Determine max number of sig. words per class.
3. Process potential significant word list and attempt to fill up quota for
each class with significant words (note that a significant word allocated to
a class may also be significant for another class). */
private void idSignificantWordsSome() {
// Define an array to hold significant word counts per class.
int[] counterArray = new int[numClasses];
// Determine max number of sig. words per class
int classMax = maxNumSigWords/numClasses;
// Process potential significant word list
for (int index=0;index=maxNumSigWords) break;
}
// Add top N unsassigned words to make up maximum.
int sigWordsStillToGo = maxNumSigWords-sigWords;
// Identify remaining significant words.
for (int index=0;index Proceeds by processing word
bin tree node by node, each word bin tree node includes a list of the
documents in which the word represented by the node are contained. Note
that documents are numbered from 1 onwards while the data array is indexed
starting from zero therefore it is necessary to subtract 1 from the
document number to obtian the array index. If word is a key (significant)
word add it to trainuing data array by including its attribute number.
@param node the current location in the phrase bin tree.
@param attNum the current attrivute number. */
private short generateDataArray(WordBinTreeNode node, short attNum) {
    // An empty branch consumes no attribute numbers.
    if (node == null) return attNum;

    // Left branch first, so attribute numbers follow the in-order walk.
    short nextAttNum = generateDataArray(node.beforeBranch,attNum);

    // Only significant words become attributes; each one takes the next
    // attribute number and is recorded against its document list.
    if (node.isSigWord) {
        addToDataArray(node.docNumbers,nextAttNum);
        nextAttNum++;
    }

    // Right branch continues from wherever numbering has reached.
    return generateDataArray(node.afterBranch,nextAttNum);
}
/** Adds keyword attribute to appropriate locations in data array. Note
that documents are numbered from 1 onwards while the data array is indexed
starting from zero therefore it is necessary to subtract 1 from the
document number to obtain the array index.
@param docNumbers the list of documents in which the attribute appears.
@param attNum the current attribute number. */
private void addToDataArray(int[] docNumbers, short attNum) {
// Process document numbers array
for(int docNumIndex=0;docNumIndexProceeds in a recursive manner by
processing the word bin tree.
@param testDocBase the given instance of the class TestSetDocBase.
@param node the current location in the word bin tree.
@param docNum the current document number.
@param attNum the current attribute number. */
private short generateTestDataArray(TestSetDocumentBase testDocBase,
        WordBinTreeNode node, int docNum, short attNum) {
    // An empty branch consumes no attribute numbers.
    if (node == null) return attNum;

    // Left branch first, so attribute numbers follow the in-order walk.
    short nextAttNum = generateTestDataArray(testDocBase,node.beforeBranch,
            docNum,attNum);

    // For a significant word, hand the word, document number and
    // attribute number to the test set document base (method in
    // TestSetDocumentBase class), then move to the next attribute number.
    if (node.isSigWord) {
        testDocBase.findInTestSetDoc(node.word,docNum,nextAttNum);
        nextAttNum++;
    }

    // Right branch continues from wherever numbering has reached.
    return generateTestDataArray(testDocBase,node.afterBranch,docNum,
            nextAttNum);
}
/* ------------------------------------ */
/* SET METHODS */
/* ------------------------------------ */
/** Starts a walk of the whole word bin tree, from its root, that sets the
notPrerviouslyInDoc flag of every node to true so that the tree is
ready for processing the next training set document. */
public void setDocNumbersToZero() {
    // Delegate to the recursive walker, beginning at the root of the tree.
    this.setDocNumbersToZero(wordBinTreeStart);
}
/** Continues the walk of the word bin tree, setting the
notPrerviouslyInDoc flag to true on the given node and on every node
beneath it, ready for processing the next training set document.
@param node the current location in the bin tree. */
private void setDocNumbersToZero(WordBinTreeNode node) {
    // An empty branch ends the recursion.
    if (node == null) return;

    // In-order walk: before branch, this node, then after branch. The flag
    // is set on every node visited.
    setDocNumbersToZero(node.beforeBranch);
    node.notPrerviouslyInDoc=true;
    setDocNumbersToZero(node.afterBranch);
}
/** Sets documents per class array to the given values.
@param newDocsPerClass the given numbers of documents per class. */
public void setDocsPerClass(int[] newDocsPerClass) {
docsPerClass = new int[newDocsPerClass.length];
// process given array
for (int index=0;index NOTE: when defining documents as phrases the word bin tree gets deleted
(so as to gain menmory) once the phrase bin tree has been created. This
output method must therefore be included either: (i) in the
loadTrainingSetDocumentBase method in the TextMining
class after the word bin tree has been created, or (ii) in the
processTrainingSetDocumentBase method in the PhraseMining
class before the word bin tree is deleted.
@return the number of unpruned nodes. */
public int getNumNodesInWordBinTree() {
    // Count from the root of the word bin tree, starting the total at zero.
    return getNumNodesInWordBinTree(wordBinTreeStart,0);
}
/** Continues the process of counting nodes in the word bin tree. Every
non-null node reached from the given node adds one to the running total.
@param node the current location in the bin tree.
@param countSofar the number of nodes counted so far.
@return the running total after the given node and both of its branches
have been counted. */
public int getNumNodesInWordBinTree(WordBinTreeNode node, int countSofar) {
    // An empty branch leaves the total unchanged.
    if (node == null) return countSofar;

    // Count this node, then thread the total through the before branch and
    // finally the after branch.
    int totalWithBefore = getNumNodesInWordBinTree(node.beforeBranch,
            countSofar+1);
    return getNumNodesInWordBinTree(node.afterBranch,totalWithBefore);
}
/* --------------------------------------------------------------------- */
/* */
/* POTENTIAL SIGNIFICANT WORDS BUBBLE SORT */
/* */
/* --------------------------------------------------------------------- */
/** Commences a "bubble sort" on the list of potential significant words.
Has the advantage that it maintains alpha ordering. */
private void sortPotSigWordList() {
boolean isOrdered;
int index;
do {
isOrdered = true;
index = 0;
while (index < (potSigWordList.length-1)) {
double value1 = getPotSigWordContribValue(index);
double value2 = getPotSigWordContribValue(index+1);
if (value1First finds the "cIndex" into the contribution
array, then gets the contribution array ref, and then the value. Used by
sort routines.
@param index the given index into potentia significant word list.
@return the contribution. */
private int getPotSigWordContribValue(int index) {
    // Look up the list entry once; it carries both the class index and the
    // word bin tree node that owns the contribution array.
    final PotentialSigWord entry = potSigWordList[index];
    final int classSlot = (int) entry.classIndex;
    // The contribution recorded for that class is the value compared by
    // the sort routines.
    return entry.wordBinTreeNode.contribution[classSlot];
}
/* -------------------------------------------------------------------- */
/* */
/* POTENTIAL SIGNIFICANT WORDS QUICK SORT */
/* */
/* -------------------------------------------------------------------- */
/** Starts a "quick sort" over the whole list of potential significant
words by invoking the recursive sort on the full index range. Algorithm
taken from Cay Horstmann's Big Java (2nd Edition), Wiley. */
private void sortPotSigWordListQ() {
    // Full range: first slot to last slot of the list.
    sortPotSigWordListQ(0,potSigWordList.length-1);
}
/** Performs a "quick sort" on the list of potential significant words.
@param fromIndex the start index of the current sub-array in the list.
@param toIndex the end index of the current sub-array in the list. */
private void sortPotSigWordListQ(int fromIndex, int toIndex) {
if (fromIndexpivot) {
localFrom++;
fromValue = getPotSigWordContribValue(localFrom);
}
localTo--;
toValue = getPotSigWordContribValue(localTo);
while (toValue NOTE: when defining documents as phrases the word bin tree
gets deleted (so as to gain menmory) once the phrase bin tree has been
created. This output method must therefore be included either: (i) in the
loadTrainingSetDocumentBase method in the TextMining
class after the word bin tree has been created, or (ii) in the
processTrainingSetDocumentBase method in the PhraseMining
class before the word bin tree is deleted. */
public void outputWordBinTreeUNW() {
    // Heading
    System.out.println("WORD BIN TREE");
    System.out.println("Upper noise threshold words");
    System.out.println("===========================");

    // Body: either report an empty tree or walk it from the root with the
    // punctuation counter starting at zero.
    if (wordBinTreeStart == null) {
        System.out.println("EMPTY!\n");
    }
    else {
        outputWordBinTreeUNW(wordBinTreeStart,0);
        System.out.println();
    }

    // Trailing blank lines
    System.out.println("\n");
}
/** Continues process of outputting upper noise threshold words in word bin
tree.
@param node the current location in the bin tree.
@param puncCounter pinctuation counter.
@return punctuation counter sofar. */
private int outputWordBinTreeUNW(WordBinTreeNode node, int puncCounter) {
if (node != null) {
// Process before branch
puncCounter = outputWordBinTreeUNW(node.beforeBranch,puncCounter);
// Process node
if (node.isUpperNoiseWord) {
System.out.print("{" + node.word + " " + node.support);
// Classes for which word is significant (Note that
// classSupport array have been set to null during pruning of
// word Bin tree --- saves space!).
if (node.classSupport!=null) {
System.out.print(", classes: [");
for (int index=0;index NOTE: when defining documents as phrases the word bin tree
gets deleted (so as to gain menmory) once the phrase bin tree has been
created. This output method must therefore be included either: (i) in the
loadTrainingSetDocumentBase method in the TextMining
class after the word bin tree has been created, or (ii) in the
processTrainingSetDocumentBase method in the PhraseMining
class before the word bin tree is deleted. */
public void outputWordBinTreeLNW() {
    // Heading
    System.out.println("WORD BIN TREE");
    System.out.println("Lower noise threshold words");
    System.out.println("===========================");

    // Body: either report an empty tree or walk it from the root with the
    // punctuation counter starting at zero.
    if (wordBinTreeStart == null) {
        System.out.println("EMPTY!\n");
    }
    else {
        outputWordBinTreeLNW(wordBinTreeStart,0);
        System.out.println();
    }

    // Trailing blank lines
    System.out.println("\n");
}
/** Continues process of outputting lower noise threshold words in word bin
tree.
@param node the current location in the bin tree.
@param puncCounter pinctuation counter.
@return punctuation counter sofar. */
private int outputWordBinTreeLNW(WordBinTreeNode node, int puncCounter) {
if (node != null) {
// Process before branch
puncCounter = outputWordBinTreeLNW(node.beforeBranch,puncCounter);
// Process node
if (node.isLowerNoiseWord) {
System.out.print("{" + node.word + " " + node.support);
// Classes which it is related to.
if (node.classSupport!=null) {
System.out.print(", classes: [");
for (int index=0;index NOTE: when defining documents as phrases the word bin tree gets
deleted (so as to gain menmory) once the phrase bin tree has been created.
This output method must therefore be included either: (i) in the
loadTrainingSetDocumentBase method in the TextMining
class after the word bin tree has been created, or (ii) in the
processTrainingSetDocumentBase method in the PhraseMining
class before the word bin tree is deleted. */
public void outputWordBinTreeOW() {
    // Heading
    System.out.println("WORD BIN TREE");
    System.out.println("Ordinary words");
    System.out.println("==============");

    // Body: either report an empty tree or walk it from the root with the
    // punctuation counter starting at zero.
    if (wordBinTreeStart == null) {
        System.out.println("EMPTY!\n");
    }
    else {
        outputWordBinTreeOW(wordBinTreeStart,0);
        System.out.println();
    }

    // Trailing blank lines
    System.out.println("\n");
}
/** Continues process of outputting ordinary words in word bin tree.
@param node the current location in the bin tree.
@param puncCounter pinctuation counter.
@return punctuation counter sofar. */
private int outputWordBinTreeOW(WordBinTreeNode node, int puncCounter) {
if (node != null) {
// Process before branch
puncCounter = outputWordBinTreeOW(node.beforeBranch,puncCounter);
// Process node
if (node.isOrdWord) {
System.out.print("{" + node.word + " " + node.support);
// Classes which it is related to.
if (node.classSupport!=null) {
System.out.print(", classes: [");
for (int index=0;indexisSigWordfield is set to true, this
is not necessarily the same as all those words whose significance
contribution exceeds the significance index (G). */
public void outputWordBinTreeSW() {
    // Heading and a legend describing the per-word output format.
    System.out.println("WORD BIN TREE");
    System.out.println("Significant words (support)");
    System.out.println("===========================");
    System.out.println("Format: (N) W S CS Contrib");
    System.out.println("\tN = sequential number, W = word literal,\n\tS " +
            "= support count, CS = list of support counts per class " +
            "if available,\n\tContrib = list of contributions per " +
            "class if available\n\t(* indicates class(es) for " +
            "which word is significant)");

    // Body: either report an empty tree or walk it from the root,
    // numbering the significant words from 1.
    if (wordBinTreeStart == null) {
        System.out.println("EMPTY!\n");
    }
    else {
        outputWordBinTreeSW(wordBinTreeStart,1);
        System.out.println();
    }

    // Trailing blank lines
    System.out.println("\n");
}
/** Continues the output of significant words by an in-order walk of the
word bin tree. Each node flagged as a significant word is printed on its own
line as a sequence number, the word and its support, followed by the
per-class supports and per-class contributions when those arrays are
present.
@param node the current location in the bin tree.
@param counter the sequence number to give the next significant word.
@return the sequence number to give the next significant word after this
subtree has been output. */
private int outputWordBinTreeSW(WordBinTreeNode node, int counter) {
    // An empty branch prints nothing and uses no sequence numbers.
    if (node == null) return counter;

    // Before branch first, so words come out in tree order.
    int nextNumber = outputWordBinTreeSW(node.beforeBranch,counter);

    if (node.isSigWord) {
        // Sequence number, word and support.
        System.out.print("(" + nextNumber + ") " + node.word + " " +
                node.support);
        // Optional per-class detail; either array may be null.
        if (node.classSupport!=null) {
            outputSupportClassData(node.classSupport);
        }
        if (node.contribution!=null) {
            outputContribClassData(node.contribution);
        }
        System.out.println();
        nextNumber++;
    }

    // After branch continues the numbering.
    return outputWordBinTreeSW(node.afterBranch,nextNumber);
}
/** Outputs supports per classes if available.
@param classData the class support data array*/
private void outputSupportClassData(int[] classData) {
System.out.print(", Class supportss: [");
// Process class support array
for (int index=0;indexsigIndex) System.out.print("*");
}
// End
System.out.print("]");
}
/* OUTPUT WORD BIN TREE */
/** Starts the recursive output of the full word bin tree, or reports that
the tree is empty. NOTE: when defining documents as phrases the word bin
tree gets deleted (so as to gain memory) once the phrase bin tree has been
created. This output method must therefore be included either: (i) in the
loadTrainingSetDocumentBase method in the TextMining
class after the word bin tree has been created, or (ii) in the
processTrainingSetDocumentBase method in the PhraseMining
class before the word bin tree is deleted. */
public void outputWordBinTree() {
    // Heading
    System.out.println("FULL WORD BIN TREE");
    System.out.println("==================");

    // Body: either report an empty tree or walk it from the root with the
    // punctuation counter starting at zero.
    if (wordBinTreeStart == null) {
        System.out.println("EMPTY!\n");
    }
    else {
        outputWordBinTree(wordBinTreeStart,0);
        System.out.println();
    }

    // Trailing blank lines
    System.out.println("\n");
}
/** Continues process of outputting word bin tree with indication of nature
of words stored at nodes (lower noise word (L), upper noise word (U),
ordinary word (O), significant word (S)).
@param node the current location in the bin tree.
@param puncCounter the punctuation counter sofar.
@return the current punctuation counter. */
private int outputWordBinTree(WordBinTreeNode node, int puncCounter) {
if (node != null) {
// Process before branch
puncCounter = outputWordBinTree(node.beforeBranch,puncCounter);
// Process node
System.out.print("{" + node.word + " " + node.support + " ");
// Label
if (node.isUpperNoiseWord) System.out.print("-U ");
else if (node.isLowerNoiseWord) System.out.print("-L ");
else if (node.isOrdWord) System.out.print("-O ");
else System.out.print("-S ");
// Classes which it is related to.
if (node.classSupport!=null) {
System.out.print(", classes: [");
for (int index=0;indexsigIndex) {
count++;
sigWdsPerClass[index]++;
}
}
// Process duplicates
int duplicateCount = count-1;
if (duplicateCount > 0) {
for (int index=0;indexsigIndex)
duplicates[index]++;
}
}
}
// Process after branch
genSigWdsPerClass(node.afterBranch,sigWdsPerClass,duplicates);
}
}
/* OUTPUT NUMBER OF POTENTIAL SIGNIFICANT WORDS PER CLASS */
/** Gets and outputs number of potential significant words per class. */
public void outputPotSigWdsPerClass() {
// If no significant word list return
if (potSigWordList==null) {
System.out.println("Empty significant words list\n");
return;
}
// Generate output
int[] sigWordsPerClass = new int[numClasses];
for (int index=0;index=topN) break;
}
System.out.println("\n");
}
}
/* OUTPUT POTENTIAL SIGNIFICANT WORDS LIST */
/** Outputs the potential significant words list, i.e. words in bin tree
whose contribution exceeds the significance index (G) and therefore may be
eventually catagorized as significant. */
public void outputPotentialSigWords() {
System.out.println("Potential significant words\n");
// If no significant word list return
if (potSigWordList==null) {
System.out.println("Empty significant words list\n");
return;
}
// Start output
System.out.println("#\t| Pot. Sig. Wd | Val.\t| Class\t|" +
"Sig (Y/N)\n------------------------" +
"-----------------------------------");
// Process array
int total=0;
for (int index=0;index