/* -------------------------------------------------------------------------- */ /* */ /* PHRASE BASED TEXT CLASSIFICATION */ /* */ /* Frans Coenen */ /* */ /* Tuesday 20 December 2005 */ /* (Revised January 2006) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Class structure AssocRuleMining | +-- TextMining | +-- PhraseMining */ //package lucsKDD_ARM; // Java packages import java.io.*; import java.util.*; /** Class containing methods to coordinate phrase identification from training set of documents, recast the training and testset in terms of the identified phrases and then produce a classifier from the training set tested on the test set. @author Frans Coenen @version 20 December 2005. */ public class PhraseMining extends TextMining { /* ---------------------------------------------------------------- */ /* */ /* FIELDS */ /* */ /* ---------------------------------------------------------------- */ /** Instance of the class PhraseBinTree in which to store words. */ protected PhraseBinTree phraseBinTree = null; // Flags /** Indicates request to output phrase bin tree. */ protected boolean outputPhraseBinTreeFlag = false; /** Indicates request to output phrase bin tree statistics. */ protected boolean outputPhraseBinTreeStatsFlag = false; /** Indicates request to output phraselist. */ protected boolean outputPhraseListFlag = false; /** Indicates request to output phraselist (first 100 items). */ protected boolean outputPhraseList100flag = false; /* ---------------------------------------------------------------- */ /* */ /* CONSTRUCTORS */ /* */ /* ---------------------------------------------------------------- */ /** Constructor processes command line arguments. @param args the command line arguments (array of String instances). */ public PhraseMining(String[] args) { super(args); // Create phraseBinTree object for later use. phraseBinTree = new PhraseBinTree(); } /* ---------------------------------------------------------------- */ /* */ /* METHODS */ /* */ /* ---------------------------------------------------------------- */ /* ------------------------------------------------------ */ /* TOP LEVEL METHODS */ /* ------------------------------------------------------ */ /** Top level method to: (i) reads/loads the words in the input document base and stores in a "word" bin tree structure and (ii) cause the "word" bin tree to be processed to identify key phrases. @return true if OK to proceed and false otherwise. */ public boolean findPhrasesInDB() throws IOException { final boolean OK_TO_PROCEED = true; System.out.println("START ANALYSING DOC BASE FOR PHRASES\n"); if (docClasses==null) { System.out.println("ERROR: No classes!"); System.exit(1); } // Load document base loadTrainingSetDocumentBase(); if (outputNumDocsPerClassInTrainSetFlag) outputDocumentsPerClass(); // Process document base and generate word and phrase bin trees. // If there are not too many one itemsets process test set. wordBinTree.setDocsPerClass(docsPerClass); if (processTrainingSetDocumentBase()) { // Process test document set loadTestSetDocumentBase(); // Output attribute sets if requested if (outputTrainSetAttNumFlag) { System.out.println("TRAINING DATA SET\n-----------------"); outputDataArray(dataArray); } if (outputTestSetAttNumFlag) { System.out.println("TEST DATA SET\n-------------"); outputDataArray(testDataArray); } return(OK_TO_PROCEED); } else return(!OK_TO_PROCEED); } /** Commences processing of word bin tree (containing training set document data) to ID: (1) noise words (words above/below noise thresholds), (2) ordinary words (words that do not serve to differentiate between classes), abd (3) significant words (words that do not serve to differentiate between classes).

The desired phrases are defined as per methods in the sub-classes of this class. Once identified the input training data is recast so that each document is represented according to whether the phrase is contained in the document or not. Trainining set is stored as a 2-D array with rows representing records and columns attributes. @return true if number of idcenrified 1-itemsets is less than the specified maximum (a constant) and false otherwise. */ private boolean processTrainingSetDocumentBase() { final boolean NUM_ONE_ITEMSETS_LESS_THAN_MAX = true; // ID noise and insignificant words from bin tree System.out.println("ID NOISE, ORDINARY AND SIGNIFICANT WORDS IN " + "WORDS BIN TREE\nLNT = " + lowerNoiseThold + "% (" + lowerNoiseDocThold + " docs), UNT = " + upperNoiseThold + "% (" + upperNoiseDocThold + " docs), SI = " + significanceIndex); System.out.println("Max # sig words " + maxNumSigWords); System.out.println("================================================"); // Prune word bin tree wordBinTree.pruneWordBinTree(lowerNoiseDocThold,upperNoiseDocThold, numRows); // Generate potential significant words list wordBinTree.generatePotentialSigWordList(); wordBinTree.idSignificantWords(); // Output options for word bin tree outputWordBinTreeOptions(); // Create phrase bin tree, throw away word bin tree and then create // training data array for DM algorithm. createPhraseBinTree(); wordBinTree=null; // Output options for phrase bin tree and assign training data array to // data array field. outputPhraseBinTreeOptions(); dataArray = phraseBinTree.genTrainingDataArray(trainingSetIDs.length, docBase); // Set numOneItemSets field contained in AssocRuleMining parent class. int numAttributes = phraseBinTree.getNumNodesInPhraseBinTree(); numOneItemSets = numAttributes+docClasses.length; if (numOneItemSets>MAX_NUM_ONE_ITEMSETS) { System.out.println("Number of one itemsets (" + numOneItemSets + ") exceeds maximum of " + MAX_NUM_ONE_ITEMSETS); return(!NUM_ONE_ITEMSETS_LESS_THAN_MAX); } else System.out.println("Number of one itemsets = " + numOneItemSets); if (outputTrainSetStatsFlag) outputTrainingSetStats(numOneItemSets, docClasses.length); // Throw away training document base. docBase = null; return(NUM_ONE_ITEMSETS_LESS_THAN_MAX); } /** Creates phrase bin tree according to definition of phrase as defined in sub class.

Overwritten in sub-classes. */ protected void createPhraseBinTree() { /* STUBB */ } /** Commences process of loading test document base and storing in an array of arrays. */ private void loadTestSetDocumentBase() throws IOException { System.out.println("READ TEST SET"); System.out.println("============="); if (testSetIDs==null) { System.out.println("ERROR: No test set!"); System.exit(1); } // Define testDocBase object int numRowsInTestSet = testSetIDs.length; testDocBase = new TestSetDocumentBase(numRowsInTestSet); System.out.println("Num. docs. in test set = " + numRowsInTestSet + "\n"); // loop through document base. for(int index=0;index "); else { outputItemSetWithAttLiterals(linkRuleNode.antecedent); System.out.print(" -> "); } // Output class (first and only attribute in the rule consequent). String ClassLiteral = getClassLiteral(linkRuleNode.consequent[0]); System.out.print(ClassLiteral + " "); System.out.println(" " + twoDecPlaces(linkRuleNode.confidenceForRule) + "%"); } /* OUTPUT ITEMSET */ /** Outputs a given item set as a set of phrase literals. @param itemSet the given item set. */ private void outputItemSetWithAttLiterals(short[] itemSet) { // Check for empty set if (itemSet == null) System.out.print(" null "); // Process else { // Reconvert where input dataset has been reordered and possible // pruned. short[] tempItemSet = reconvertItemSet(itemSet); // Loop through item set elements int counter = 0; for (int index=0;index