/* -------------------------------------------------------------------------- */ /* */ /* TEXT MINING MODEL */ /* Frans Coenen */ /* Tuesday 28 February 2006 */ /* (Revisions: 15 March 2006, 20 March 2006, 8 November 2006) */ /* */ /* */ /* -------------------------------------------------------------------------- */ /* Experimental image mining GUI model for text mining. Algorithm is as follows: startTextClassification\0: if (batch mode) { startTextClassificationBM\0: LOOP: through batch S, C and G values startTextClassificationBM\3: LOOP: through batch UNT and LNT values startTextClassificationBM\5: Set parameters startTextClassification3\0: END LOOP: END LOOP: } else if (setOfEightMode) { LOOP: through eight significant word strategies startTextClassification2\0: Set parameters startTextClassification3\0: END LOOP: } else { startTextClassification2\0: Set parameters startTextClassification3\0: } startTextClassification3\0: if (90:10 or 50:50 split) { startMiningRatioSplit\2: Create instance of phrase/keyword mining class Set file specification Define training and test sets according to ratio (output option) startMining\0: (Start phrase or key word mining) } if (TCV) { tenCrossEvaluation\0: Determine TCV split LOOP: through tenths Define training and test sets (output option) startMining\0: (Start phrase or key word mining) END LOOP: output TCV parameters array } */ //package lucsKDD_ARM; // Java packages import java.awt.*; import java.awt.event.*; import javax.swing.*; import javax.swing.event.*; import java.io.*; import java.util.*; public class TextMiningModel extends JFrame { /* ------ FIELDS ------ */ // CONTROL CONSTANTS /** Available significant word contribution calculation strategies. */ private final static String[] SW_CONTRIB_CALC_STRATS = {"Word Frequency", "Word Support"}; /** Available potential significant word list generation strategies. */ private final static String[] PSWL_GEN_STRATS = {"Unique Sig. 
Words", "All Significant Words"}; /** Availabel significant word identification strategies. */ private final static String[] SW_ID_STRATS = {"First N (distributed)", "First N",}; /** Available text mining algorithms. */ private final static String[] TM_ALGORITHMS = {"DelSN_contGO", "DelSN_contGW","DelSO_contGN","DelSO_contGW", "Keywords"}; /** Available evaluation menu options. */ private final static String[] EVAL_OPTIONS = {"90:10","50:50","TCV"}; /** Available ordering menu options. */ private final static String[] SAT_STRAT_OPTIONS = {"CSA","CSA best K"}; /** Available training set output menu options.
"Training set IDs" controlled from this class, remainder need to be set in target class. */ private final static String[] TRAINING_SET_OUT_OPS = {"Training set doc IDs","Training set doc sizes", "Training set raw","Training set raw 1st 10", "Training set marked up","Training set marked up 1st 10", "Train. set Phrases/K'words","Train. set Phrases/K'words 1st 10", "Training set attribute #","Training set Docs. per class", "Training Set Stats"}; /** Available test set output menu options.
"Test set IDs" controlled from this class, remainder need to be set in target class. */ private final static String[] TEST_SET_OUT_OPS = {"Test set doc IDs", "Test set doc sizes","Test set raw", "Test set Phrases/Keywords","Test set attribute #", "Test Set Stats"}; /** Available word bin tree output menu options. */ private final static String[] WORD_BIN_TREE_OUT_OPS = {"Word Bin Tree", "Lower Noise Words","Upper Noise Words", "Ordinary Words","Word Bin Tree Stats", "Words with Count 1"}; /** Available significant word output menu options. */ private final static String[] SIG_WORD_OUT_OPS = {"Significant Words", "# pot sig words per class","# sig words per class", "Potential Sig. Words","Top 10 Sig. Wds. per Class"}; /** Available phrase bin tree output menu options. */ private final static String[] PHRASE_BIN_TREE_OUT_OPS = {"Phrase Bin Tree", "Phrase Bin Tree Stats","Phrase list","Phrase list 1st 100"}; /** Available TFPC output menu options all controlled from this class. */ private final static String[] TFPC_OUT_OPS = {"T-tree Stats", "P & T tree Storage","Large Itemsets","Ttree","Ptree"}; /** Available classification output menu options.
"Test set IDs" controlled from this class, remainder need to be set in target class.*/ private final static String[] CLASSIFIER_OUT_OPS = {"Classifier Stats.","Classifier", "Rules fired","Classification"}; // SETTINGS /** Array of threshold values ordered as follows: (a) Support, (b) Confidence, (c) Significance, (d) Upper Noise and (e) Lower Noise. */ private double[] tHoldValues = {0.1,35.0,3.0,7.0,0.2}; /** 2-D Array of batch mode parameter values.
First dimension ordered as follows: (a) Support, (b) Confidence, (c) Significance, (d) Upper Noise and (e) Lower Noise. Second dimension ordered as follows: (a) from, (b) to and (c) step. */ private double[][] batchValues = {{0.1,0.2,0.5},{20.0,55.0,5.0}, {3.0,4.0,1.0},{7.0,8.0,10.0},{0.2,0.3,1.0}}; /** Array of training set output settings. */ private boolean[] trainingSetOutSettings = {false,false,false,false,false, false,false,false,false,false,false}; /** Array of test set output settings. */ private boolean[] testSetOutSettings = {false,false,false,false,false, false}; /** Array of word bin tree output settings. */ private boolean[] wordBinTreeOutSettings = {false,false,false,false,false, false}; /** Array of significant word output settings. */ private boolean[] sigWordOutSettings = {false,false,false,false,false}; /** Array of phrase bin tree output settings. */ private boolean[] phraseBinTreeOutSettings = {false,false,false,false}; /** Array of TFPC output settings. */ private boolean[] tfpcOutSettings = {false,false,false,false,false}; /** Array of classifier output settings. */ private boolean[] classifierOutSettings = {false,false,false,false}; /** Maximum number of significant words allowed. */ private int maxNumSigWords = 1500; /** File stem. */ private String fileStem = null; /** File end. */ private String fileEnd = null; /** Number of documents (records) in the text base. */ private int numOfDocs = 0; /** List of possible classes */ private String[] classList = null; /** Significance contribution calculation strategy to be used. */ private String sigContribCalcStrat = null; /** Potential significant word list generation strategy to be used. */ private String potSWlistGenStrat = null; /** Significant word ID strategy to be used. */ private String sigWordSelectStrat = null; /** Text mining algorithm to be used. */ private String tmAlgorithm = null; /** Text mining satisfaction strategy to be used. 
*/ private String tmSatStrat = null; /** Text mining evaluation strategy to be used. */ private String tmEvalStrat = null; /** Array of argument strings. */ private String[] args = new String[5]; // Instance fields /** Instance of class KeyWordMining where appropriate. */ private KeyWordMining newKeyWordMining = null; /** Instance of class PhraseMining where appropriate. */ private PhraseMining newPhraseMining = null; /** Instance of class TextMining (super class of both PhraseMining and KeyWordMining). */ private TextMining newTextMining = null; /** Instance of class AprioriTFP_CRgen where appropriate. */ private AprioriTFP_CRgen newAprioriTFPC = null; // TCV /** 2-D array to hold final accuracy and number of CRs values for each "tenth" on completion of AprioriTFP-CR algorithm. */ private double[][] parameters = new double[10][2]; /** Current TCV index. */ private int tcvIndex = 0; // BATCH MODE FIELDS /** Batch mode counter. */ private int batchCounter=0; // FLAGS /** Phrase mining flag. */ private boolean phraseMiningFlag = false; /** Keyword mining flag. */ private boolean keyWordMiningFlag = false; /** TCV flag */ private boolean tcvFlag = false; /** Batch mode flag, used to run a sequence of experiments with a range of support, confidence, significance, lower noise and upper noise thresholds. */ private boolean batchModeFlag = false; /** Set of eight mode flag, used to run a set of eight experiments all with the same support, confidence, significance, lower noise and upper noise thresholds, but using the eight different significant word identification strategies. */ private boolean setOfEightFlag = false; /* --------------------------------------------------- */ /* */ /* CONSTRUCTORS */ /* */ /* --------------------------------------------------- */ /** Zero argument constructor. 
*/ public TextMiningModel() { } /* ---------------------------------------------- */ /* */ /* METHODS */ /* */ /* ---------------------------------------------- */ /** Start classification process.
Simply outputs banner and determines whether batch mode is required or not. */ public void startTextClassification() throws IOException { System.out.println("**********************"); System.out.println("*\n* START TEST RUN\n*"); System.out.println("**********************"); // Determibne if batch mode or not if (batchModeFlag) startTextClassificationBM(); else if (setOfEightFlag) startTextClasssetOf8(); else startTextClassification2(); System.out.println("********** END ************"); } /** Continiues proces of starting classification process in single mode (not batch mode).
Contrives a sequence of input arguments and then
continues classification. */
public void startTextClassification2() throws IOException {
    // Switch prefixes, listed in the same order as the entries of
    // tHoldValues: support, confidence, significance, upper noise and
    // lower noise.
    final String[] thresholdSwitches = {"-S", "-C", "-G", "-U", "-L"};
    final int numThresholds = thresholdSwitches.length;

    // One slot per threshold plus a final slot for the maximum number of
    // significant words.
    args = new String[numThresholds + 1];
    for (int slot = 0; slot < numThresholds; slot++) {
        args[slot] = thresholdSwitches[slot] + tHoldValues[slot];
    }
    args[numThresholds] = "-M" + maxNumSigWords;

    // Carry on according to the evaluation criteria, then clear the flags
    // and instance references.
    startTextClassification3();
    reset();
}
/** Clears every mode flag (phrase mining, keyword mining, TCV, batch mode
and set of eight) and sets the phrase mining, keyword mining, text mining
and AprioriTFP_CRgen instance references to null. */
private void reset() {
    // Drop the instance references
    newAprioriTFPC = null;
    newTextMining = null;
    newKeyWordMining = null;
    newPhraseMining = null;
    // Switch all flags off in a single chained assignment
    phraseMiningFlag = keyWordMiningFlag = tcvFlag = batchModeFlag
        = setOfEightFlag = false;
}
/** Continue text classification processing, both single mode and batch
mode, according to the indicated classification validation criteria. */
public void startTextClassification3() throws IOException {
double time1;
// Loop through list of available training-test set splits
int index=0;
for ( ;index