/* -------------------------------------------------------------------------- */ /* */ /* TEXT MINING (PRE-PROCESSING) */ /* */ /* Frans Coenen */ /* */ /* Friday 3 February 2006 */ /* (Revissions: 14/3/2006, 31/10/2006) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Text preprocessing algorithms for use with TFPC. Class structure: AssocRuleMining | +-- TextMining. */ //package lucsKDD_ARM; // Java packages import java.io.*; import java.util.*; // Java GUI packages import javax.swing.*; /** Parent class for phrase and keyword based text mining classes. Contains methods to coordinate keyword/phrase identification from training set of documents, read the training and test sets, recast the training and testset in terms of the identified keywords/phrases and then produce a classifier from the training set tested on the test set. @author Frans Coenen @version 14 Match 2006. */ public class TextMining extends AssocRuleMining { /* ---------------------------------------------------------------- */ /* */ /* FIELDS */ /* */ /* ---------------------------------------------------------------- */ //Constants /** Maximum number of one item sets that can be processed --- (2^15)-1. */ protected static final int MAX_NUM_ONE_ITEMSETS = 32767; // Command line arguments /** Lower noise threshold (%) below which words are considered to be noise. */ protected double lowerNoiseThold = 5.0; /** Upper noise threshold (%) above which words are considered to be noise. */ protected double upperNoiseThold = 50.0; /** Index for identifying significanr words (i.e. those that serve to distinguish between classes). */ protected double significanceIndex = 1.5; // Other Arguments (assigned using set methods). /** File stem */ private String fileStem = null; /** File end */ private String fileEnd = null; /** List of training set document numbers. 
*/ protected short[] trainingSetIDs = null; /** List of test set document numbers. */ protected short[] testSetIDs = null; /** List of possible classes, used for: (i) identifying classes in input data and (ii) output. */ protected String[] docClasses = null; /** Maximum number of significant word allowed. */ protected int maxNumSigWords = 1500; // Objects /** Instance of the class wordBinTree in which to store words. */ protected WordBinTree wordBinTree = null; /** Instance of the class TrainingSetDocumentBase in which to store entire set of training documents. */ protected TrainingSetDocumentBase docBase = null; /** Instance of the class TestSetDocumentBase in which to store individual (one at a time) test set documents. */ protected TestSetDocumentBase testDocBase = null; // Parameters /** Lower support threshold (in terms of number of document) below which words are considered to be insignificant. */ protected int lowerNoiseDocThold = 0; /** Upper support threshold (in terms of number of document) above which words are considered to be insignificant. */ protected int upperNoiseDocThold = 0; /** Number of rows in training set, also not the same as the number of rows in the classification training set. */ protected int numRowsInTrainingSet; // Data structures /** 2-D array to hold the test data
Note that classifiaction involves producing a set of Classification Rules (CRs) from a training set and then testing the effectiveness of the CRs on a test set. */ protected short[][] testDataArray = null; /** Documents per class in training set. */ protected int[] docsPerClass = null; // Other /** The class number of the current document. */ protected short classNumber = 0; // Flags /** Significant word contribution calculation strategy. */ private String sigWordContCalcStrat = null; /** Potential significant word list generation strategy. */ private String potSWlistGenStrat = null; /** Significant word selection strategy. */ private String sigWordSelectStrat = null; // Output Flags /** Indicates request to output number of documents per class in training set. */ protected boolean outputNumDocsPerClassInTrainSetFlag = false; /** Indicates request to output size of documents in training set. */ protected boolean outputSizeOfDocInTrainSetFlag = false; /** Indicates request to output training set in its raw form. */ protected boolean outputTrainSetRawFlag = false; /** Indicates request to output training set in its raw form (1st 10 documents only). */ protected boolean outputTrainSetRaw10flag = false; /** Indicates request to output trainingset in its attribute number form. */ protected boolean outputTrainSetAttNumFlag = false; /** Indicates request to output trainingset in its "marked up" form. */ protected boolean outputTrainSetAttMarkedFlag = false; /** Indicates request to output trainingset in its "marked up" form (1st 10 documents only). */ protected boolean outputTrainSetAttMarked10flag = false; /** Indicates request to output size of documents in test set. */ protected boolean outputSizeOfDocInTestSetFlag = false; /** Indicates request to output test set in its raw form. */ protected boolean outputTestSetRawFlag = false; /** Indicates request to output test set in its attribute number form. 
*/ protected boolean outputTestSetAttNumFlag = false; /** Indicates request to output word bin tree. */ protected boolean outputWordBinTreeFlag = false; /** Indicates request to output word bin tree upper noise words. */ protected boolean outputWordBinTreeUNflag = false; /** Indicates request to output word bin tree lower noise words. */ protected boolean outputWordBinTreeLNflag = false; /** Indicates request to output word bin tree ordinary words. */ protected boolean outputWordBinTreeOWflag = false; /** Indicates request to output word bin tree significant words. */ protected boolean outputWordBinTreeSWflag = false; /** Indicates request to word bin tree statistics. */ protected boolean outputWordBinTreeStatsFlag = false; /** Indicates request to oputput significant words per class. */ protected boolean outputSigWordsPerClassFlag = false; /** Indicates request to oputput potential significant words per class. */ protected boolean outputPotSigWdsPerClassFlag = false; /** Indicates request to oputput the potential significant words list. */ protected boolean outputPotSigWordsListFlag = false; /** Indicates request to oputput the top 10 potential significant words per class. */ protected boolean outputTop10sigWordsListFlag = false; /** Indicates request to output training set statistics. */ protected boolean outputTrainSetStatsFlag = false; /** Indicates request to output test set statistics. */ protected boolean outputTestSetStatsFlag = false; /** Indicates request to output wirds with count of 1. */ protected boolean outputCount1wrdsFlag = false; // Diagnostics /** Number of nodes in word bin tree. */ protected int numNodesInWordBinTree = 0; /** Array (list) of rules in which to store number of times each rule is fired. (used for diagnostic purposes only). Second array has three elements, (i) number of times fired, (ii) number of times correctly fired, (iii) number of times incorrectly fired. 
*/ private short[][] diagRulebaseList; /** ID number of the last rule fired (used for diagnostic purposes only). */ private int diagLastRuleFired; /** Array of arrays describing which rule classified which record in test set.
First array size equivalent to number of records in test set; second
array has two elements: (i) classification class number (may not be same
as actual class), (ii) ID number of rule fired. */
private short[][] diagTestDataArray;
/* ---------------------------------------------------------------- */
/* */
/* CONSTRUCTORS */
/* */
/* ---------------------------------------------------------------- */
/** Constructor processes command line arguments.
@param args the command line arguments (array of String instances). */
public TextMining(String[] args) {
// Process command line arguments
for(int index=0;index Note that a single word may devide into two sub words if (say) it
includes a hyphen or an apostrophe.
@param word the given word.
@return number of words in word (may be more than one if a stop character
is found). */
private int getSizeofWordOld(String word) {
int size = 0;
// Process word looking for non letters and punctuation. Index is the
// cell index of the given word (initialised to 0), start index is the
// cell index to the start of a sub-word that may be located within the
// given word (initialised to 0).
int startIndex = 0;
int index = 0;
for(;index Note that stop
marks are indicated by a "null", these are not added to the word bin tree
but are included in the training document base as they are later used to
identify phrases.
@param word the given word identified in the training document set.
@param docIndex the index (document number), into the training set
document array of arrays, of the current document number. */
/** Records a word found in a training set document.
A non-null word is converted to lower case and added to the word bin tree
together with the document index and the current class number; the node
returned by the tree is then appended to the document in the training
document base. A null word (a stop mark) is not added to the tree: a null
node reference is appended to the document instead.
@param word the word found in the training document set, or null for a
stop mark.
@param docIndex the index of the current document passed on to both the word
bin tree and the training document base. */
private void trainingSetWordFound(String word, int docIndex) {
    // NOTE(review): toLowerCase() uses the default locale; if locale
    // independent casing is wanted it should be changed consistently with
    // the test set path -- confirm before altering.
    final WordBinTreeNode treeNode = (word == null)
            ? null
            : wordBinTree.addToWordBinTree(word.toLowerCase(), docIndex,
                    classNumber);

    // The document base receives an entry for stop marks as well (null).
    docBase.addWordToDocInDocBase(docIndex, treeNode);
}
/* -------------------------------- */
/* READ TEST SET DOCUMENT */
/* -------------------------------- */
/* READ TEST SET DOCUMENT */
/** Reads the given test set document text file and processes its text one
line at a time.
<p>The file name is built as {@code fileStem + docNum + fileEnd}. Every line
read is split into tokens using the default {@code StringTokenizer}
delimiters and passed, with its token count, to {@code readTestSetLine}.
Reading stops at the end of the file. The file is always closed on exit,
whether the end of file was reached, an {@code IOException} occurred, or the
processing of a line failed with a runtime exception (which is propagated to
the caller rather than silently discarded).
@param docNum the document identification sequential number. */
protected void readTestSetDoc(short docNum) {
    // fileName and fileInput are defined in the AssocRuleMining parent
    // class, as is method openFileName.
    fileName = fileStem + docNum + fileEnd;
    openFileName(fileName);

    try {
        // No reader available: nothing to process, just release resources
        // (the finally clause below calls closeFile).
        if (fileInput == null) {
            return;
        }
        // readLine returns null at end of file; test for it explicitly
        // instead of relying on a NullPointerException being thrown.
        String line;
        while ((line = fileInput.readLine()) != null) {
            StringTokenizer dataLine = new StringTokenizer(line);
            readTestSetLine(dataLine, dataLine.countTokens());
        }
    }
    catch (IOException e) {
        // Reading is abandoned for this document, but the failure is
        // reported rather than hidden.
        System.out.println("WARNING: Error reading test set document " +
                fileName + ": " + e);
    }
    finally {
        closeFile();
    }
}
/* READ LINE */
/** Processes a line of text from test document file.
@param dataLine the tokenized line of text to be processed.
@param numberOfTokens the number of tokens in the line. */
private void readTestSetLine(StringTokenizer dataLine,
int numberOfTokens) {
for (int counter=0;counter Called from text mining model.
@return the percentage accuracy. */
public double testClassification() {
int correctClassCounter = 0;
// Check if test data exists, if not return 0
if (testDataArray==null) {
System.out.println("WARNING: No test data");
return(0);
}
// Check if any classification rules have been generated, if not
// return'0'
if (startRulelist==null) {
System.out.println("No classification rules generated!");
return(0);
}
// Initialise data collection arrays.
diagInitDiagArrays();
// Loop through test set
int index=0;
for(;index Overides
method in AprioriTFPclass class. DEPRECATED
@param itemset the record to be classified.
@return the classification. */
/* private short classifyRecordDefault(short[] itemSet) {
int ruleCounter = 0;
RuleNode linkRuleNode = startRulelist;
while (true) {
// Default (last) rule, next reference is empty
if (linkRuleNode.next==null) {
diagRulebaseList[ruleCounter][0]++;
diagLastRuleFired = ruleCounter;
return(linkRuleNode.consequent[0]);
}
// Compare antecedents
if (isSubset(linkRuleNode.antecedent,itemSet)) {
diagRulebaseList[ruleCounter][0]++;
diagLastRuleFired = ruleCounter;
return(linkRuleNode.consequent[0]);
}
// Increment parameters
else {
ruleCounter++;
linkRuleNode = linkRuleNode.next;
}
}
} */
/** Outputs list of rules fired array. */
public void diagRulesFired() {
System.out.println("RULES FIRED\n-----------");
numCorrectAndIncorrectFirings();
// Test if rules exist
if (diagRulebaseList!=null) {
System.out.println("Number of rules = " + diagRulebaseList.length +
" (including default rule)");
for (int index=0;index Numbers range from given start to end inclusively.
@param startDocNum the first document number in the given range.
@param endDocNum the last document number in the given range. */
public void addToTrainingSet(int startDocNum, int endDocNum) {
int numNewDocs = endDocNum-startDocNum+1;
// If no previously recorded training set documents then create such a
// set.
if (trainingSetIDs == null) {
trainingSetIDs = new short[numNewDocs];
for (int index=0;index