/* -------------------------------------------------------------------------- */ /* */ /* TRAINING SET DOCUMENT BASE */ /* */ /* Frans Coenen */ /* */ /* Tuesday 20 December 2005 */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /* Class structure SetDocumentBase | +-- TrainingSetDocumentBase */ //package lucsKDD_ARM; /** Class containing methods to describe a set of training documents in terms of references to words in the bin tree and to generate the phrase bin tree. Does not produce the desired final training set in terms of attribute numbers, this is done by methods in the PhraseBinTree class. Similarly methods to output the training set (in whatever format) are contained in the PhraseBinTree class. @author Frans Coenen @version 20 December 2005 */ public class TrainingSetDocumentBase extends SetDocumentBase { /* ------------------------------- */ /* */ /* FIELDS */ /* */ /* ------------------------------- */ /** Inner class describing document in terms of word bin tree nodes. */ public class Document { /** List of words */ private WordBinTreeNode[] content = null; /** Class label */ private short classLabel; } /** The document base (array of document objects) describing each document in terms of words. */ public Document[] docBase = null; /** Content index for current document. */ private int contentIndex = 0; /* ------------------------------------ */ /* */ /* CONSTRUCTORS */ /* */ /* ------------------------------------ */ /** One argument constructor. @param numberOfDocs the number of documents in the training set document base. */ public TrainingSetDocumentBase(int numberOfDocs) { // Create instances of class Document. docBase = new Document[numberOfDocs]; for(int index=0;indexProcess each document in turn. @param phraseBinTree instance of the class PhraseBinTree. 
*/ public void genPhraseBinTree_DelSN_ContGO(PhraseBinTree phraseBinTree) { for (int index=0;index Phrase made up of ordinary words and at least one significant word. Phrase delimited by stop mark and/or noise words. @param phraseBinTree instance of the class PhraseBinTree. @param docNum the current document number. @param content the content of the document in terms of a set of references to WordBinTreeNode objects. */ private void genPhraseBinTree_DelSN_ContGO(PhraseBinTree phraseBinTree, int docNum, WordBinTreeNode[] content) { String[] phrase = null; boolean lookingForPhraseStart = true; boolean lookingForPhraseEnd = false; String fstSigWordInPhrase = null; int indexOfFstSigWordInPhrase = -1; // Process document word by word for (int index=0;index=0) { phraseBinTree.addToPhraseBinTree(phrase,docNum, fstSigWordInPhrase,indexOfFstSigWordInPhrase); } lookingForPhraseStart = true; lookingForPhraseEnd = false; fstSigWordInPhrase = null; indexOfFstSigWordInPhrase = -1; } } } // Got to end of document, may have a last phrase if (indexOfFstSigWordInPhrase>=0) phraseBinTree.addToPhraseBinTree(phrase, docNum,fstSigWordInPhrase,indexOfFstSigWordInPhrase); } /** Append a word to a phrase so far. @param phrase the phrase so far. @param newWord the word to be appended. @return the newly concatenated phrase. */ private String[] addToPhrase(String[] phrase, String newWord) { String[] newPhrase; // Empty phrase if (phrase==null) { newPhrase = new String[1]; newPhrase[0] = newWord; } // Not empty else { newPhrase = new String[phrase.length+1]; // Copy int index=0; for (;indexProcess each document in turn. @param phraseBinTree instance of the class PhraseBinTree. */ public void genPhraseBinTree_DelSO_ContGN(PhraseBinTree phraseBinTree) { for (int index=0;index Phrase made up of noise words and at least one significant word. Phrase delimited by stop mark and/or ordinary words. @param phraseBinTree instance of the class PhraseBinTree. @param docNum the current document number. 
@param content the content of the document in terms of a set of references to WordBinTreeNode objects. */ private void genPhraseBinTree_DelSO_ContGN(PhraseBinTree phraseBinTree, int docNum,WordBinTreeNode[] content) { String[] phrase = null; boolean lookingForPhraseStart = true; boolean lookingForPhraseEnd = false; String fstSigWordInPhrase = null; int indexOfFstSigWordInPhrase = -1; // Process document word by word for (int index=0;index=0) { phraseBinTree.addToPhraseBinTree(phrase,docNum, fstSigWordInPhrase,indexOfFstSigWordInPhrase); } lookingForPhraseStart = true; lookingForPhraseEnd = false; fstSigWordInPhrase = null; indexOfFstSigWordInPhrase = -1; } } } // Got to end of document, may have a last phrase if (indexOfFstSigWordInPhrase>=0) phraseBinTree.addToPhraseBinTree(phrase, docNum,fstSigWordInPhrase,indexOfFstSigWordInPhrase); } /* ------------------------------------------------------------- */ /* GENERATE PHRASE BIN TREE */ /* */ /* DELIMITERS = NOISE WORDS AND STOP MARKS */ /* CONTENT = DISTINGUISHING WORDS AND INCLUDED ORDINARY WORDS */ /* REPRESENTED AS WILD CARDS */ /* IGNORE = EXCLUDED ORDINARY WORDS */ /* */ /* ------------------------------------------------------------- */ /** Commences process of generating phrase bin tree.

Process each document in turn. @param phraseBinTree instance of the class PhraseBinTree. */ public void genPhraseBinTree_DelSN_ContGW(PhraseBinTree phraseBinTree) { for (int index=0;index Phrase made up of wild cards and at least one significant word. Phrase delimited by stop mark and/or noise words. @param phraseBinTree instance of the class PhraseBinTree. @param docNum the current document number. @param content the content of the document in terms of a set of references to WordBinTreeNode objects. */ private void genPhraseBinTree_DelSN_ContGW(PhraseBinTree phraseBinTree, int docNum, WordBinTreeNode[] content) { String[] phrase = null; boolean lookingForPhraseStart = true; boolean lookingForPhraseEnd = false; String fstSigWordInPhrase = null; int indexOfFstSigWordInPhrase = -1; //System.out.println("NEW DOCUMENT\n***********************"); // Process document word by word for (int index=0;indexProcess each document in turn. @param phraseBinTree instance of the class PhraseBinTree. */ public void genPhraseBinTree_DelSO_ContGW(PhraseBinTree phraseBinTree) { for (int index=0;index Phrase made up of wild cards and at least one significant word. Phrase delimited by stop mark and/or ordinary words. @param phraseBinTree instance of the class PhraseBinTree. @param docNum the current document number. @param content the content of the document in terms of a set of references to WordBinTreeNode objects. 
*/ private void genPhraseBinTree_DelSO_ContGW(PhraseBinTree phraseBinTree, int docNum,WordBinTreeNode[] content) { String[] phrase = null; boolean lookingForPhraseStart = true; boolean lookingForPhraseEnd = false; String fstSigWordInPhrase = null; int indexOfFstSigWordInPhrase = -1; //System.out.println("NEW DOCUMENT\n***********************"); // Process document word by word for (int index=0;index=0) { //System.out.print("Add Phrase:"); //for(int i=0;i=0) phraseBinTree.addToPhraseBinTree(phrase, docNum,fstSigWordInPhrase,indexOfFstSigWordInPhrase); } /* ------------------------------------------- */ /* GET AND SET METHODS */ /* ------------------------------------------- */ /** Gets number of documents in document base. @return number of documents in document base. */ public int getSizeOfDocBase() { return(docBase.length); } /** Gets class label for given document. @param docNumber the ID number for the given document. @return the class label. */ public short getClassLabel(int docNumber) { return(docBase[docNumber].classLabel); } /** Sets the class label field for the relevant document in the document base. @param docNumber the ID number for the given document. @param label the class label. */ public void setClassLabel(int docNumber, short label) { docBase[docNumber].classLabel = label; } /* ----------------------------------- */ /* DIAGNOSTICS */ /* ----------------------------------- */ /** Commences process of outputting document base as a set of phrases. */ public void outputDocBase() { // Process document base for (int index=0;index