/* -------------------------------------------------------------------------- */
/*                                                                            */
/*                       TEXT MINING (PRE-PROCESSING)                         */
/*                                                                            */
/*                               Frans Coenen                                 */
/*                                                                            */
/*                          Friday 3 February 2006                            */
/*                   (Revisions: 14/3/2006, 31/10/2006)                       */
/*                                                                            */
/*                      Department of Computer Science                        */
/*                        The University of Liverpool                         */
/*                                                                            */
/* -------------------------------------------------------------------------- */

/* Text preprocessing algorithms for use with TFPC.

Class structure:

AssocRuleMining
      |
      +-- TextMining. */

//package lucsKDD_ARM;

// Java packages
import java.io.*;
import java.util.*;

// Java GUI packages
import javax.swing.*;

/** Parent class for phrase and keyword based text mining classes. Contains
methods to coordinate keyword/phrase identification from training set of
documents, read the training and test sets, recast the training and test set
in terms of the identified keywords/phrases and then produce a classifier
from the training set tested on the test set.
@author Frans Coenen
@version 14 March 2006. */

public class TextMining extends AssocRuleMining {

    /* ---------------------------------------------------------------- */
    /*                                                                  */
    /*                              FIELDS                              */
    /*                                                                  */
    /* ---------------------------------------------------------------- */

    // Constants

    /** Maximum number of one item sets that can be processed --- (2^15)-1. */
    protected static final int MAX_NUM_ONE_ITEMSETS = 32767;

    // Command line arguments

    /** Lower noise threshold (%) below which words are considered to be
    noise. */
    protected double lowerNoiseThold = 5.0;

    /** Upper noise threshold (%) above which words are considered to be
    noise. */
    protected double upperNoiseThold = 50.0;

    /** Index for identifying significant words (i.e. those that serve to
    distinguish between classes). */
    protected double significanceIndex = 1.5;

    // Other Arguments (assigned using set methods).

    /** File stem */
    private String fileStem = null;

    /** File end */
    private String fileEnd = null;

    /** List of training set document numbers. */
    protected short[] trainingSetIDs = null;

    /** List of test set document numbers. */
    protected short[] testSetIDs = null;

    /** List of possible classes, used for: (i) identifying classes in input
    data and (ii) output. */
    protected String[] docClasses = null;

    /** Maximum number of significant words allowed. */
    protected int maxNumSigWords = 1500;

    // Objects

    /** Instance of the class WordBinTree in which to store words. */
    protected WordBinTree wordBinTree = null;

    /** Instance of the class TrainingSetDocumentBase in which to store entire
    set of training documents. */
    protected TrainingSetDocumentBase docBase = null;

    /** Instance of the class TestSetDocumentBase in which to store individual
    (one at a time) test set documents. */
    protected TestSetDocumentBase testDocBase = null;

    // Parameters

    /** Lower support threshold (in terms of number of documents) below which
    words are considered to be insignificant. */
    protected int lowerNoiseDocThold = 0;

    /** Upper support threshold (in terms of number of documents) above which
    words are considered to be insignificant. */
    protected int upperNoiseDocThold = 0;

    /** Number of rows in training set, also not the same as the number of
    rows in the classification training set. */
    protected int numRowsInTrainingSet;

    // Data structures

    /** 2-D array to hold the test data

    Note that classification involves producing a set of Classification Rules
    (CRs) from a training set and then testing the effectiveness of the CRs on
    a test set. */
    protected short[][] testDataArray = null;

    /** Documents per class in training set. */
    protected int[] docsPerClass = null;

    // Other

    /** The class number of the current document. */
    protected short classNumber = 0;

    // Flags

    /** Significant word contribution calculation strategy. */
    private String sigWordContCalcStrat = null;

    /** Potential significant word list generation strategy. */
    private String potSWlistGenStrat = null;

    /** Significant word selection strategy. */
    private String sigWordSelectStrat = null;

    // Output Flags

    /** Indicates request to output number of documents per class in training
    set. */
    protected boolean outputNumDocsPerClassInTrainSetFlag = false;

    /** Indicates request to output size of documents in training set. */
    protected boolean outputSizeOfDocInTrainSetFlag = false;

    /** Indicates request to output training set in its raw form. */
    protected boolean outputTrainSetRawFlag = false;

    /** Indicates request to output training set in its raw form (1st 10
    documents only). */
    protected boolean outputTrainSetRaw10flag = false;

    /** Indicates request to output training set in its attribute number
    form. */
    protected boolean outputTrainSetAttNumFlag = false;

    /** Indicates request to output training set in its "marked up" form. */
    protected boolean outputTrainSetAttMarkedFlag = false;

    /** Indicates request to output training set in its "marked up" form (1st
    10 documents only). */
    protected boolean outputTrainSetAttMarked10flag = false;

    /** Indicates request to output size of documents in test set. */
    protected boolean outputSizeOfDocInTestSetFlag = false;

    /** Indicates request to output test set in its raw form. */
    protected boolean outputTestSetRawFlag = false;

    /** Indicates request to output test set in its attribute number form. */
    protected boolean outputTestSetAttNumFlag = false;

    /** Indicates request to output word bin tree. */
    protected boolean outputWordBinTreeFlag = false;

    /** Indicates request to output word bin tree upper noise words. */
    protected boolean outputWordBinTreeUNflag = false;

    /** Indicates request to output word bin tree lower noise words. */
    protected boolean outputWordBinTreeLNflag = false;

    /** Indicates request to output word bin tree ordinary words. */
    protected boolean outputWordBinTreeOWflag = false;

    /** Indicates request to output word bin tree significant words. */
    protected boolean outputWordBinTreeSWflag = false;

    /** Indicates request to output word bin tree statistics. */
    protected boolean outputWordBinTreeStatsFlag = false;

    /** Indicates request to output significant words per class. */
    protected boolean outputSigWordsPerClassFlag = false;

    /** Indicates request to output potential significant words per class. */
    protected boolean outputPotSigWdsPerClassFlag = false;

    /** Indicates request to output the potential significant words list. */
    protected boolean outputPotSigWordsListFlag = false;

    /** Indicates request to output the top 10 potential significant words per
    class. */
    protected boolean outputTop10sigWordsListFlag = false;

    /** Indicates request to output training set statistics. */
    protected boolean outputTrainSetStatsFlag = false;

    /** Indicates request to output test set statistics. */
    protected boolean outputTestSetStatsFlag = false;

    /** Indicates request to output words with count of 1. */
    protected boolean outputCount1wrdsFlag = false;

    // Diagnostics

    /** Number of nodes in word bin tree. */
    protected int numNodesInWordBinTree = 0;

    /** Array (list) of rules in which to store number of times each rule is
    fired. (used for diagnostic purposes only). Second array has three
    elements, (i) number of times fired, (ii) number of times correctly fired,
    (iii) number of times incorrectly fired. */
    private short[][] diagRulebaseList;

    /** ID number of the last rule fired (used for diagnostic purposes
    only). */
    private int diagLastRuleFired;

    /** Array of arrays describing which rule classified which record in test set.

First array size equivalent to number of records in test set; second array has two elements: (i) classification class number (may not be same as actual class), (ii) ID number of rule fired. */ private short[][] diagTestDataArray; /* ---------------------------------------------------------------- */ /* */ /* CONSTRUCTORS */ /* */ /* ---------------------------------------------------------------- */ /** Constructor processes command line arguments. @param args the command line arguments (array of String instances). */ public TextMining(String[] args) { // Process command line arguments for(int index=0;index Confidence and support not used in text preprocessing but form part of command line argument as required later. Passed to TFPC using set method. @param argument the given argument. */ protected void idArgument(String argument) { if (argument.length()<3) { System.out.println("ERROR: Command line argument '" + argument + "' too short."); errorFlag = false; } else if (argument.charAt(0) == '-') { char flag = argument.charAt(1); argument = argument.substring(2,argument.length()); switch (flag) { case 'C': confidence = Double.parseDouble(argument); break; case 'G': significanceIndex = Double.parseDouble(argument); break; case 'L': lowerNoiseThold = Double.parseDouble(argument); break; case 'M': maxNumSigWords = Integer.parseInt(argument); break; case 'S': support = Double.parseDouble(argument); break; case 'U': upperNoiseThold = Double.parseDouble(argument); break; default: System.out.println("ERROR: Unrecognise command line " + " argument: '" + flag + argument + "'."); errorFlag = false; } } else { System.out.println("ERROR: All command line arguments must " + "commence with a '-' character ('" + argument + "')."); errorFlag = false; } } /* CHECK INPUT ARGUMENTS */ /** Invokes methods to check values associated with command line arguments (overides higher level method). 
*/ protected void CheckInputArguments() { // Check support and confidence input checkSupportAndConfidence(); // Check file name checkThresholds(); // Return if (errorFlag) outputSettings(); else outputMenu(); } /* CHECK THRESHOLDS */ /** Checks lower and upper noise threshold values: must all be between 100.0 and 0.0, and upper threshold must be above lower threshold. If this is not the case then errorFlag set to false. */ protected void checkThresholds() { // Check lower noise threshold if ((lowerNoiseTholdMAX_SUPPORT)) { System.out.println("INPUT ERROR: Lower noise threshold (" + lowerNoiseThold + ") must be specified as a " + "percentage (" + MIN_SUPPORT + " - " + MAX_SUPPORT + ")"); errorFlag = false; } // Check upper noise threshold if ((upperNoiseTholdMAX_SUPPORT)) { System.out.println("INPUT ERROR: Upper noise threshold (" + upperNoiseThold + ") must be specified as a " + "percentage (" + + MIN_SUPPORT + " - " + MAX_SUPPORT + ")"); errorFlag = false; } // Check lower noise threshold below upper noise threshold if (upperNoiseThold<=lowerNoiseThold) { System.out.println("INPUT ERROR: Upper noise threshold (" + upperNoiseThold + ") must be greater than " + "lower noise threshold (" + lowerNoiseThold + ")"); errorFlag = false; } } /* ------------------------------------------------------ */ /* TOP LEVEL METHODS */ /* ------------------------------------------------------ */ /** Commences process of loading training document base and storing it in a word bin tree and an array of array of references to nodes in the word bin tree. */ protected void loadTrainingSetDocumentBase() throws IOException { System.out.println("READ TRAINING SET"); System.out.println("================="); if (trainingSetIDs==null) { System.out.println("ERROR: No training set!"); System.exit(1); } // Calculate lower and upper noise thresholds, and minimum support // value, in terms of number of documents in training set. 
The numRows // and minSupport fields are defined in the AssocRuleMining class numRowsInTrainingSet = trainingSetIDs.length; numRows = numRowsInTrainingSet; minSupport = (numRows*support)/100.0; lowerNoiseDocThold = (int) (lowerNoiseThold*numRowsInTrainingSet/100); upperNoiseDocThold = (int) (upperNoiseThold*numRowsInTrainingSet/100); // Define document base (training set) docBase = new TrainingSetDocumentBase(numRowsInTrainingSet); System.out.println("Num. docs. in training set = " + numRowsInTrainingSet + "\n"); // loop through document base and create word bin tree. for(int index=0;index Note that a word ending on a stop mark counts as two words. A word containing supurious charcaters and a stop mark counts as one word (the stop mark). Otherwise the word is ignored. @param word the given word. @return number of words in word (may be more than one if stop charcater found. */ private int getSizeofWord(String word) { int size = 0; // Get last character and check if stop mark int lastIndex = word.length()-1; int asciiCode = (int) word.charAt(lastIndex); if (isStopMark(asciiCode)) { // If all other charcters are alpahbetic than word followed by stop // mark. if (allAlphaChars(word.substring(0,lastIndex))) size = size + 2; // Otherwise stop mark only else size++; } // Otherwise word or stop mark. Note that tom's would be defined as a // stop mark. else size++; // Return return(size); } /** DEPRECATED --- Process word looking for non letters and punctuation.

Note that a single word may devide into two sub words if (say) it includes a hythen or an appostraphy. @param word the given word. @return number of words in word (may be more than one if stop charcater found. */ private int getSizeofWordOld(String word) { int size = 0; // Process word looking for non letters and punctuation. Index is the // cell index of the given word (initialised to 0), start index is the // cell index to the start of a sub-word that may be located within the // given word (initialised to 0). int startIndex = 0; int index = 0; for(;indexOperates by looping through the tokens. For each token: If word starts '@' then found start of header line If first token is "@class" then found class header --- identify class, set class label in training set documents structure, increment diagnostic documents per class counter. Breakand ignire either entire header if not class header or rest of class header. Else process word. @param dataLine the line of text to be processed in the form of a sequence of tokens. @param numberOfTokens the number of tokens in the given line of text. @param docIndex the index, into the training set documents array of arrays, of the current document number. */ private void readLine(StringTokenizer dataLine, int numberOfTokens, int docIndex) { for (int counter=0;counter If the word ends in a stop mark: If all other characters are alpahbetic than gound word followed by stop mark. Else found stop mark only. Else If all charcters are alpahbetic than found word. Else found stop mark (for exampl tom's would be interprerted as a stop mark). @param word the given word. @param docIndex the index, into the training set document array of arrays, of the current document number. 
*/ private void parseWord(String word, int docIndex) { // Get last character and check if stop mark int lastIndex = word.length()-1; int asciiCode = (int) word.charAt(lastIndex); if (isStopMark(asciiCode)) { // If all other charcters are alpahbetic than word followed by stop // mark. if (allAlphaChars(word.substring(0,lastIndex))) { trainingSetWordFound(word.substring(0,lastIndex),docIndex); // Otherwise add stop mark only. trainingSetWordFound(null,docIndex); } // Otherwise stopmark only else trainingSetWordFound(null,docIndex); } else { // If all charcters are alpahbetic than word. if (allAlphaChars(word)) trainingSetWordFound(word,docIndex); // Otherwise stop mark else trainingSetWordFound(null,docIndex); } } /** Converts word found in training set to lower case and then updates the word bin tree and the training set document base.

Note that stop marks are indicated by a "null", these are not added to the
    word bin tree but are included in the training document base as they are
    later used to identify phrases.
    @param word the given word identified in the training document set.
    @param docIndex the index (document number), into the training set
    document array of arrays, of the current document number. */
    private void trainingSetWordFound(String word, int docIndex) {
        WordBinTreeNode node = null;

        // If not stop mark
        if (word != null) {
            word = word.toLowerCase();
            node = wordBinTree.addToWordBinTree(word,docIndex,classNumber);
        }

        // Add to doc base (node stays null for a stop mark)
        docBase.addWordToDocInDocBase(docIndex,node);
    }

    /* -------------------------------- */
    /*     READ TEST SET DOCUMENT       */
    /* -------------------------------- */

    /* READ TEST SET DOCUMENT */

    /** Reads the given test set document text file and processes its text.

    The file name is built as fileStem + docNum + fileEnd and opened with
    openFileName; each line is tokenised on white space and passed to
    readTestSetLine. The file is closed with closeFile whether or not
    reading succeeds. An I/O failure is reported on standard output.
    @param docNum the document identification sequential number. */
    protected void readTestSetDoc(short docNum) {
        // fileName and fileInput are defined in the AssocRuleMining parent
        // class, as is method openFileName.
        fileName = fileStem + docNum + fileEnd;
        openFileName(fileName);

        try {
            String line;
            // readLine() returning null marks end of file. Testing for it
            // directly (rather than catching NullPointerException) means a
            // genuine null dereference further down is no longer hidden.
            while ((line = fileInput.readLine()) != null) {
                StringTokenizer dataLine = new StringTokenizer(line);
                readTestSetLine(dataLine,dataLine.countTokens());
            }
        }
        catch (IOException e) {
            System.out.println("ERROR: problem reading test set document '" +
                                fileName + "': " + e.getMessage());
        }
        finally {
            closeFile();
        }
    }

    // NOTE(review): readTestSetLine below is truncated in this copy of the
    // file. Everything after "counter" in the loop header, the rest of the
    // method and the opening of the next Javadoc comment are missing. The
    // damaged text is kept exactly as found; restore it from the original
    // source rather than reconstructing it by hand.

    /* READ LINE */

    /** Processes a line of text from test document file.
    @param line the line of text to be processed.
    @param numberOfTokens the number of tokens in the line. */
    private void readTestSetLine(StringTokenizer dataLine, int numberOfTokens) { for (int counter=0;counterTestSetDocumentBase @param word the given word. 
*/ private void parseTestWord(String word) { // Get last character and check if stop mark int lastIndex = word.length()-1; int asciiCode = (int) word.charAt(lastIndex); if (isStopMark(asciiCode)) { // If all other charcters are alpahbetic than word followed by stop // mark. if (allAlphaChars(word.substring(0,lastIndex))) { testSetWordFound(word.substring(0,lastIndex)); // Otherwise add stop mark only. testDocBase.addWord(null); } else testDocBase.addWord(null); } else { // If all charcters are alpahbetic than word. if (allAlphaChars(word)) testSetWordFound(word); // Stop mark else testDocBase.addWord(null); } } /** Converts word found in test set to lower case and then adds to the tese set document base. @param word the given word identified in the test document set. */ private void testSetWordFound(String word) { // Convert to lower case add to test documnet base testDocBase.addWord(word.toLowerCase()); } /* ------------------------------------------ */ /* DIAGNOSTIC CLASSIFICATION METHODS */ /* ------------------------------------------ */ /* Methods that carry out the classification of the the test set with the inclussion of diagnostic data collection --- overide methods in parent class. */ /* TEST CLASSIFICATION */ /** Tests the generated classification rules using test set and returns percentage accuracy.

Called from text mining model. @param the perecentage accuarcy. */ public double testClassification() { int correctClassCounter = 0; // Check if test data exists, if not return' 0' if (testDataArray==null) { System.out.println("WARNING: No test data"); return(0); } // Check if any classification rules have been generated, if not // return'0' if (startRulelist==null) { System.out.println("No classification rules generated!"); return(0); } // Initailise data collection arrays. diagInitDiagArrays(); // Loop through test set int index=0; for(;index Copy of method in AprioriTclass. @param itemset the record to be classified. @return the classification. */ protected short classifyRecordDefault(short[] itemSet) { return(classifyRecordDefault(itemSet,startRulelist)); } /** Continues process of earching through rule data looking for a rule antecedent which is a subset of the input set or the default rule (last rule), when found returns the class and notes which rule was fired for which test case. @param itemset the record to be classified. @param node the currentNode. @return the classification. */ protected short classifyRecordDefault(short[] itemSet, RuleNode node) { // Process node if (node != null) { // Left branch short consClass = classifyRecordDefault(itemSet,node.leftBranch); if (consClass!=0) return(consClass); // Node, Check if default (last) rule or compare antecedents. if ((node.ruleNumber==numRules) || (isSubset(node.antecedent,itemSet))) { diagRulebaseList[node.ruleNumber][0]++; diagLastRuleFired = node.ruleNumber; return(node.consequent[0]); } // Right branch return(classifyRecordDefault(itemSet,node.rightBranch)); } // Return return(0); } /* CLASSIFY RECORD (CSA ORDERING AND DEFAULT RULE) */ /** Searches through rule data looking for a rule antecedent which is a subset of the input set or the default rule (last rule), when found returns the class and notes which rule was fired for which test case.

Overides method in AprioriTFPclass class. DEPRICATED @param itemset the record to be classified. @return the classification. */ /* private short classifyRecordDefault(short[] itemSet) { int ruleCounter = 0; RuleNode linkRuleNode = startRulelist; while (true) { // Default (last) rule, next reference is empty if (linkRuleNode.next==null) { diagRulebaseList[ruleCounter][0]++; diagLastRuleFired = ruleCounter; return(linkRuleNode.consequent[0]); } // Compare antecedents if (isSubset(linkRuleNode.antecedent,itemSet)) { diagRulebaseList[ruleCounter][0]++; diagLastRuleFired = ruleCounter; return(linkRuleNode.consequent[0]); } // Increment parameters else { ruleCounter++; linkRuleNode = linkRuleNode.next; } } } */ /** Outputs list of rules fired array. */ public void diagRulesFired() { System.out.println("RULES FIRED\n-----------"); numCorrectAndIncorrectFirings(); // Test if rules exist if (diagRulebaseList!=null) { System.out.println("Number of rules = " + diagRulebaseList.length + " (including default rule)"); for (int index=0;index0) System.out.println("Rule " + (index+1) + ": " + diagRulebaseList[index][0] + " (+" + diagRulebaseList[index][1] + ",-" + diagRulebaseList[index][2] + ")"); } System.out.println("All other rules unfired.\n"); } // End System.out.println(); } /** Determine number of correctly fored rules and number of incorrectly fired rules. */ private void numCorrectAndIncorrectFirings() { // Test if rules exist if (diagRulebaseList==null) System.out.println("No rules generated!"); // Process diagnostic test data array else { for (int index=0;indexThe ID number is assigned to classNumber field. Note that a class label may comprise seveeral tokens which adds an additional complication to this and the realted methods. @param classLine The classification line from the text file. @param numberOfTokens the number of tokens in the line. 
*/ private void findClassLabel(StringTokenizer classLine, int numberOfTokens) { // Build document class label from one or more tokens String classification = null; for (int index=0;index=docClasses.length) { String s = "Class Error: unrecoginsied class \"" + classification + "\" found in input data set!"; JOptionPane.showMessageDialog(this,s,"Class Error: ", JOptionPane.ERROR_MESSAGE); index=0; } // Set class number classNumber = (short) index; } /* GET CLASS LITERAL */ /** Get the class literal for a given class number. @param classNum the given class index. @return the class liteeral associated with the given class number. */ public String getClassLiteral(short classNum) { // Get index for class int classIndex = (int) classNum-numOneItemSets+docClasses.length-1; if (classIndex<0 || classIndex>=docClasses.length) { System.out.println("ERROR: class number " + classIndex + " not found"); return("Error"); } else return("{" + docClasses[classIndex] + "}"); } /* ---------------------------------------------------------------- */ /* */ /* SET METHODS */ /* */ /* ---------------------------------------------------------------- */ /** Sets the file stem and file end fields to the given values. @param stem the file stem value. @param end the file end fvalue. */ public void setFileStemAndEnd(String stem, String end) { fileStem = stem; fileEnd = end; } /** Adds a sequence of document ID numbers to the list of training set IDs.

Numbers range from given start to end inclusively. @param startDocNum the first document number in the given range. @param endDocNum the last document number in the given range. */ public void addToTrainingSet(int startDocNum, int endDocNum) { int numNewDocs = endDocNum-startDocNum+1; // If no previously recorded training set documents then create such a // set. if (trainingSetIDs == null) { trainingSetIDs = new short[numNewDocs]; for (int index=0;indexNumbers range from given start to end inclusively. @param startDocNum the first document number in the given range. @param endDocNum the last document number in the given range. */ public void addToTestSet(int startDocNum, int endDocNum) { int numNewDocs = endDocNum-startDocNum+1; // If no previously recorded test set documents. */ if (testSetIDs == null) { testSetIDs = new short[numNewDocs]; for (int index=0;index called from application. @param classList the given list of classes for the application. */ public void setDocClasses(String[] cl) { // Dimension docClasses array numClasses = cl.length; docClasses = new String[numClasses]; // Process given class list for (int index=0;index