/* -------------------------------------------------------------------------- */ /* */ /* KEY WORD BASED TEXT CLASSIFICATION */ /* */ /* Frans Coenen */ /* */ /* Friday 3 February 2006 */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Class structure AssocRuleMining | +-- TextMining | +-- KeyWordMining */ //package lucsKDD_ARM; // Java packages import java.io.*; import java.util.*; /** Class containing methods to coordinate keyword identification from training set of documents, recast the training and testset in terms of the identified keywords and then produce a classifier from the training set tested on the test set. @author Frans Coenen @version 3 February 2006. */ public class KeyWordMining extends TextMining { /* ---------------------------------------------------------------- */ /* */ /* FIELDS */ /* */ /* ---------------------------------------------------------------- */ // None /* ---------------------------------------------------------------- */ /* */ /* CONSTRUCTORS */ /* */ /* ---------------------------------------------------------------- */ /** Constructor processes command line arguments. @param args the command line arguments (array of String instances). */ public KeyWordMining(String[] args) { super(args); } /* ---------------------------------------------------------------- */ /* */ /* METHODS */ /* */ /* ---------------------------------------------------------------- */ /* ------------------------------------------------------ */ /* TOP LEVEL METHODS */ /* ------------------------------------------------------ */ /** Top level method to: (i) reads the document base and stores in a bin tree structure and (ii) cause the bin tree to be processed to identify key phrases. @return true if OK to proceed and false otherwise. 
*/ public boolean findKeyWordsInDB() throws IOException { final boolean OK_TO_PROCEED = true; System.out.println("START ANALYSING DOC BASE FOR KEYWORDS\n"); if (docClasses==null) { System.out.println("ERROR: No classes!"); System.exit(1); } // Load document base loadTrainingSetDocumentBase(); if (outputNumDocsPerClassInTrainSetFlag) outputDocumentsPerClass(); // Process document base and generate word bin tree provided there // are not too many one itemsets. wordBinTree.setDocsPerClass(docsPerClass); if (processTrainingSetDocumentBase()) { // Process test document set loadTestSetDocumentBase(); // Output attribute sets if requested if (outputTrainSetAttNumFlag) { System.out.println("TRAINING DATA SET\n-----------------"); outputDataArray(dataArray); } if (outputTestSetAttNumFlag) { System.out.println("TEST DATA SET\n-------------"); outputDataArray(testDataArray); } return(OK_TO_PROCEED); } else return(!OK_TO_PROCEED); } /** Commences processing of word bin tree (containing training set document data) to ID: (1) noise words (words above/below noise thresholds), (2) ordinary words (words that do not serve to differentiate between classes), abd (3) significant words (words that do not serve to differentiate between classes).
The desired key words are then the
identified significant words, and thus the input training data is
recast so that each document is represented according to whether each key
word is contained in the document or not. The training set is stored as a 2-D
array with rows representing records and columns representing attributes.
@return true if OK to proceed and false otherwise. */
private boolean processTrainingSetDocumentBase() {
    // Report the thresholds and significance index used for this run.
    final String thresholdBanner =
        "ID NOISE, ORDINARY AND SIGNIFICANT WORDS IN " +
        "WORDS BIN TREE\nLST = " + lowerNoiseThold + "% (" +
        lowerNoiseDocThold + " docs), UST = " +
        upperNoiseThold + "% (" +
        upperNoiseDocThold + " docs), SI = " +
        significanceIndex;
    System.out.println(thresholdBanner);
    System.out.println("Max # sig words " + maxNumSigWords);
    System.out.println("================================================");

    // Prune word bin tree using the document-count noise thresholds.
    wordBinTree.pruneWordBinTree(lowerNoiseDocThold, upperNoiseDocThold,
        numRows);

    // Generate potential significant words list, then identify the
    // significant words; the returned value is used below as the number of
    // attributes.
    wordBinTree.generatePotentialSigWordList();
    final int attributeCount = wordBinTree.idSignificantWords();

    // Optional diagnostic output, each item controlled by its own flag.
    if (outputWordBinTreeFlag) {
        wordBinTree.outputWordBinTree();
    }
    if (outputWordBinTreeUNflag) {
        wordBinTree.outputWordBinTreeUNW();
    }
    if (outputWordBinTreeLNflag) {
        wordBinTree.outputWordBinTreeLNW();
    }
    if (outputWordBinTreeOWflag) {
        wordBinTree.outputWordBinTreeOW();
    }
    if (outputWordBinTreeSWflag) {
        wordBinTree.outputWordBinTreeSW();
    }
    if (outputCount1wrdsFlag) {
        wordBinTree.outputWordsWithCount1();
    }
    if (outputWordBinTreeStatsFlag) {
        wordBinTree.outputWordBinTreeStats();
    }
    if (outputSigWordsPerClassFlag) {
        wordBinTree.outputSigWordsPerClass();
    }
    if (outputPotSigWdsPerClassFlag) {
        wordBinTree.outputPotSigWdsPerClass();
    }
    if (outputTop10sigWordsListFlag) {
        wordBinTree.outputTopNpotSigWords(10, docClasses);
    }
    if (outputPotSigWordsListFlag) {
        wordBinTree.outputPotentialSigWords();
    }
    if (outputTrainSetAttMarkedFlag) {
        docBase.outputDocBaseWithMarks();
    }

    // Create training data array, one row per training set document ID.
    dataArray = wordBinTree.genTrainingDataArray(trainingSetIDs.length,
        docBase);

    // Set numOneItemSets field contained in AssocRuleMining parent class:
    // one item per attribute plus one per document class.
    final int classCount = docClasses.length;
    numOneItemSets = attributeCount + classCount;

    // Too many one itemsets: report and tell the caller not to proceed.
    if (numOneItemSets > MAX_NUM_ONE_ITEMSETS) {
        System.out.println("Number of one itemsets (" + numOneItemSets +
            ") exceeds maximum of " + MAX_NUM_ONE_ITEMSETS);
        return false;
    }

    System.out.println("Number of one itemsets = " + numOneItemSets);
    if (outputTrainSetStatsFlag) {
        outputTrainingSetStats(numOneItemSets, classCount);
    }

    // Throw away training document base and return.
    docBase = null;
    return true;
}
/** Commences process of loading test document base and storing in an array
of arrays. */
private void loadTestSetDocumentBase() throws IOException {
System.out.println("READ TEST SET");
System.out.println("=============");
if (testSetIDs==null) {
System.out.println("ERROR: No test set!");
System.exit(1);
}
// Define testDocBase object
int numRowsInTestSet = testSetIDs.length;
testDocBase = new TestSetDocumentBase(numRowsInTestSet);
System.out.println("Num. docs. in test set = " + numRowsInTestSet);
// loop through document base.
for(int index=0;index