/* -------------------------------------------------------------------------- */ /* */ /* LUCS-KDD DATA DISCRETISATION/NORMALISATION */ /* (DATA PREPARATION GUI FOR DATA MINING) */ /* */ /* Frans Coenen */ /* */ /* Monday 15 June 2003 */ /* (Revised 17 Feb 2004, 14 October 2004, 17 February 2006, */ /* 28 February 2006, 1 May 2006) */ /* */ /* Department of Computer Science */ /* The University of Liverpool */ /* */ /* -------------------------------------------------------------------------- */ /** The LUCS-KDD (Liverpool University Computer Science - Knowledge Discocery in Data) DN (discretization/normalisation) software has been developed to convert data files available in the UCI data repository (http://www.ics.uci.edu/~mlearn/ MLRepository.html) into a binary format suitable for use with Association Rule Mining (ARM) and Classigication Association Rule Mining (CARM) software. The software can, of course, equally well be used to convert data files obtained from other sources. */ import java.io.*; import java.util.*; import java.awt.*; import java.awt.event.*; import javax.swing.*; // Other packages public class LUCS_KDD_DN extends JFrame implements ActionListener { /* ------ FIELDS ------ */ /* Data Structures */ /* ------ SCHEMA CLASS ------ */ /** Inner class defining a linked list of records describing the input schema. */ private class InputSchema { /** Label for schema item. */ String name; /** Tye of schema item: 'int', 'double', 'nominal', 'unused'. */ String typeName; /** Minimum value for numeric ('int' or 'double') schema items. */ double min=0; /** Maximum value for numeric ('int' or 'double') schema items. */ double max=0; /** Range (maximum-minimum) value for numeric ('int' or 'double') schema items. */ double range=0; /** list of nominal values in the case nominal schema items, set to null otherwise. */ String [] nominalData; /** Number of divisions. */ int numDivisions; /** Array of numerical divisions for output. 
*/ double [] divisionMarkers; /** Array of array of nominal values for output. */ int [][] nominalDivs; /* DEFAULT CONSTRUCTOR */ /** Default construcor. */ private InputSchema() { } /* COPY CONSTRUCTOR */ /** Creates a copy of the given schema item. @param oldInputSchema the given schema item. */ private InputSchema(InputSchema oldInputSchema) { name = oldInputSchema.name; typeName = oldInputSchema.typeName; min = oldInputSchema.min; max = oldInputSchema.max; range = oldInputSchema.range; if (oldInputSchema.nominalData!=null) { nominalData = new String[oldInputSchema.nominalData.length]; for (int index=0;index A numeric division is a subrange. */ private class Division { /** Start Index */ int startIndex; /** End Index */ int endIndex; /** Number of elements in division */ int length; /** Dominant class */ int dominantClass; /** Number of sample records per class. */ int numRecords[]; /** Total number of records */ int totalNumRecords; /** Probability per class */ double probability[]; /** Next link */ Division linkRef = null; /** Back link */ Division backRef = null; /* CONSTRUCTOR */ /** Four argument constructor @param sIndex the start index. @param eIndex the end index. @param domClass the dominant class @param temp temporary array containing number of records per class. */ private Division(int sIndex, int eIndex, int domClass, int[] temp) { startIndex = sIndex; endIndex = eIndex; length = eIndex-sIndex+1; dominantClass = domClass; // Assign to number of records field and determine total number // of records numRecords = new int[numClasses]; int total = 0; for (int i=0;ihasDataFlag to "true". @param discriminator the separater between numbers, e.g. spaxce or comma. 
*/ private void readDataFile(String discriminator) { try { // Set appropriate flags to false hasMaxRangeFlag = false; hasNormalisationFlag = false; // If file OK if (countNumRecordsAndCols(discriminator)) { // Dimension input data array inputDataArray = new double[numRows][]; // Read file if (inputDataSet(discriminator)) { // Set have data flag to true hasDataFlag = true; listParameters2(); } } else { hasDataFlag = false; return; } } catch(IOException ioException) { JOptionPane.showMessageDialog(this,"Error reading File", "Error: ",JOptionPane.ERROR_MESSAGE); closeInputFile(); System.exit(1); } } /* COUNT NUMBER OF RECORDS AND COLUMNS */ /** Reads the input file and dertermines the number of records and columns @param discriminator the separater between numbers, e.g. spaxce or comma. @return false if file is not in correct format or not of correct size. */ private boolean countNumRecordsAndCols(String discriminator) throws IOException { int counter = 0; int localNumCols =0 ; boolean firstLine = true; // Open the file openInputFile(); // Get first row and check discriminator, if not of correct type // rerturn false String line = fileInput.readLine(); if (line.indexOf(discriminator.charAt(0)) < 0) { if (discriminator.equals(" ")) JOptionPane.showMessageDialog(null, "File is not space separated\n\n" ); if (discriminator.equals(",")) JOptionPane.showMessageDialog(null, "File is not Comma separated\n\n" ); closeInputFile(); return(false); } // Loop through file incrementing record counter. 
while (line != null) { StringTokenizer dataLine = new StringTokenizer(line,discriminator); int numberOfTokens = dataLine.countTokens(); if (firstLine) { localNumCols=numberOfTokens; firstLine=false; } if (numberOfTokens == 0) break; counter++; line = fileInput.readLine(); } // Check number of columns with that in schema file (if it exists) if (numColsInInputSchema != localNumCols) { JOptionPane.showMessageDialog(null, "ERROR: Number of columns in input data \n(" + localNumCols + ") is not the same as in the schema " + "file (" + numColsInInputSchema + ")\n"); numRows = counter; numColsInInputData = localNumCols; closeInputFile(); return(false); } // Set number of rows and columns and close file numRows = counter; numColsInInputData = localNumCols; closeInputFile(); return(true); } /* INPUT DATA SET */ /** Reads input data from file specified in command line argument and places data in inputDataArray. @param discriminator the separater between numbers, e.g. spaxce or comma. @return true if data successfully loaded, false otherwise. */ public boolean inputDataSet(String discriminator) throws IOException { int rowIndex=0; // Open the file openInputFile(); // get first row. 
String line = fileInput.readLine(); // Loop until empty line found while (line != null) { // Create a StringTokienizer instance StringTokenizer dataLine = new StringTokenizer(line,discriminator); // Get number of tokens int numberOfTokens = dataLine.countTokens(); // If no tokens jump out of loop if (numberOfTokens == 0) break; // If number of tokens greater than that in schema file error if (numColsInInputSchema != numberOfTokens) { textArea.append("ERROR: in line " + (rowIndex+1) + " Number of columns (" + numberOfTokens + ") not the same as in the schema " + "file (" + numColsInInputSchema + ")\n"); return(false); } // Otherwise convert input string to a sequence of doubles double[] code = doubleConversion(rowIndex+1,dataLine, numberOfTokens); // Check for unrecognised nominal value error if (unrecognisedNominalValue) { textArea.append("ABORTING READ OPERATION!"); unrecognisedNominalValue=false; closeInputFile(); return(false); } // Check for "null" input if (code!=null) { // Dimension rows in 2-D inputDataArray int codeLength = code.length; inputDataArray[rowIndex] = new double[codeLength]; // Assign to elements in row for (int colIndex=0;colIndex Operates only when bith scema and data exist. Also called when a column is moved or deleted. */ private void determinMaxMinDataArray() { if (!hasInputSchemaFlag) return; // Initialise min amd max fields in scheam array to missing // term value for(int index=0;index TEST_TERM) { schema[colN].min = inputDataArray[rowN][colN]; schema[colN].max = inputDataArray[rowN][colN]; } else { if (inputDataArray[rowN][colN] < schema[colN].min) schema[colN].min = inputDataArray[rowN][colN]; if (inputDataArray[rowN][colN] > schema[colN].max) schema[colN].max = inputDataArray[rowN][colN]; } } } } } // Determine ranges determineRanges(); } /* RECALCULATE NUMBER OF MISSING ITEMS */ /** Recalculates the number of missing items in the input data set (used when a column has been removed. 
*/ private void recalculateMissingItems() { numMissingItems=0; // Loop through data array for(int index1=0;index1TEST_TERM) numMissingItems++; } } } /* ------------------------------------------------------- */ /* */ /* INPUT INPUT SCHEMA */ /* */ /* ------------------------------------------------------- */ /* INPUT INPUT SCHEMA */ /** Commences process of inputting inpurt data schema, if schema not read (for whatever reason) sets hasInputSchemaFlag value to false. */ private void inputInputSchema() { textArea.append("READING INPUT SCHEMA:\n-----------------------\n"); // Display file dialog so user can select file to open JFileChooser fileChooser = new JFileChooser(); fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY); int result = fileChooser.showOpenDialog(this); // If cancel button selected return if (result == JFileChooser.CANCEL_OPTION) return; // Obtain selected file inputFileName = fileChooser.getSelectedFile(); textArea.append("InputSchema inputFileName = " + inputFileName + "\n\n"); // Read file if readabale (i.e not a direcrory etc.). if (checkFileName()) readInputSchemaFile(); else hasInputSchemaFlag=false; // Set all other flags to false hasDataFlag = false; hasMaxRangeFlag = false; hasNormalisationFlag = false; } /* READ INPUT SCHEMA DILE */ /** Reads input schema file and stores in array of schema items. First row is "unused", "nominal", "int" or "double". 
*/ private void readInputSchemaFile() { try { // Open the file openInputFile(); // Read first line and tokenise String lineFromFile = fileInput.readLine(); StringTokenizer dataLine = new StringTokenizer(lineFromFile); // Get numberof columns in schema file numColsInInputSchema = dataLine.countTokens(); // Diemsion and initialise schema array schema = new InputSchema[numColsInInputSchema]; for (int index=0;index= MIN_NUM_DIVS) break; JOptionPane.showMessageDialog(null,"MAXIMUM NUMBER OF " + "DIVISIONS VALUE INPUT ERROR:\ninput = " + maxRange + "\nmaximum number of divisions input must be an " + "integer\ngreater or equal to " + MIN_NUM_DIVS + "\n"); } textArea.append("Maximum number of divisions = " + maxRange + "\n\n"); hasMaxRangeFlag=true; } catch(NumberFormatException e) { } } /* ---------------------------------------------- */ /* */ /* EDIT PANEL */ /* */ /* ---------------------------------------------- */ /* MOVE COLUMN N TO END */ /** Moves a column in the input file to the end of the input data (user prompted for column number to be moved). 
*/ private void moveColNtoEnd() { int newColIndex; // Input column number to be removed textArea.append("MOVE COLUMN TO END:\n--------------------\n"); int colNumToBeMovedToEnd = inputColNumber("Input column number " + "to be moved"); if (colNumToBeMovedToEnd == -1) { textArea.append("Operation cancelled\n"); return; } textArea.append("Column number to be moved to end = " + (colNumToBeMovedToEnd+1) + "\n"); // Move moveColInInputData(colNumToBeMovedToEnd,numColsInInputData); moveColInInputSchema(colNumToBeMovedToEnd,numColsInInputData); // Calculate new min and max values for ranges, and number of missing // items determinMaxMinDataArray(); recalculateMissingItems(); // End textArea.append("Column number " + (colNumToBeMovedToEnd+1) + " moved to end\n\n"); } /* MOVE COLUMN N TO BEFORE COLUMN M */ /** Moves a column in the input file to another position in the input data (user prompted for column number to be moved and column number to be moved to). */ private void moveColNtoBeforeColM() { int newColIndex; // Input column number to be removed textArea.append("MOVE COLUMN:\n-------------\n"); int colNumToBeMoved = inputColNumber("Input column number to be moved"); if (colNumToBeMoved == -1) { textArea.append("Operation cancelled\n"); return; } int toColumnNumber = inputColNumber("Input column number for colum " + (colNumToBeMoved+1) + " to be moved to"); if (toColumnNumber == -1) { textArea.append("Operation cancelled\n"); return; } textArea.append("Column number " + (colNumToBeMoved+1) + " to be moved to " + (toColumnNumber+1) + "\n"); // Move int toNumber=toColumnNumber; if (toColumnNumber>colNumToBeMoved) toNumber--; moveColInInputData(colNumToBeMoved,toNumber); moveColInInputSchema(colNumToBeMoved,toNumber);; // Calculate new min and max values for ranges, and number of missing // items determinMaxMinDataArray(); recalculateMissingItems(); // End textArea.append("Column number " + (colNumToBeMoved+1) + " moved to " + (toColumnNumber+1) + "\n\n"); } /* MOVE COLUMN 
IN INPUT DATA */ /** Moves column in input data structure. @param colToBeMoved The column number identifier for the column to be moved. @param toColumnNumber The column number identifier for the column before which the column to be moved is tombe inswerted. */ private void moveColInInputData(int colToBeMoved, int toColumnNumber) { int localNumRows = inputDataArray.length; // Store input for column to be moved} double[] inputToBeMoved = new double[localNumRows]; for (int rowN=0;rowNmoveColNtoEnd, moveColNtoBeforeColM and removeColN). @param s1 The input string to be output in message window @return the column number input by the user. */ private int inputColNumber(String s1) { int colNumber=0; try{ while (true) { String stNum1 = JOptionPane.showInputDialog(s1 + " (integer\n" + "within the range 1 and " + numColsInInputData + ")\n\n"); // Check for operation cancelled if (stNum1 == null) return(-1); colNumber = Integer.parseInt(stNum1); if (colNumber >= 1 && colNumber <= numColsInInputData) break; JOptionPane.showMessageDialog(null, "INVALID COLUMN NUMBER INPUT ERROR:\n" + "input = " + colNumber + "\ncolumn number number must be within the\n" + "range 1 and " + numColsInInputData + "\n\n"); } } catch(NumberFormatException e) { } // Return return(colNumber-1); } /* CONFIRM */ /** Creates a message window to allow user to confirm a given colum number (used with removeColN edit method). @param colNumber The column number to be confirmed @return true of OK, false otherwise. */ private boolean confirmColRemoval(int colNumber) { int result = JOptionPane.showConfirmDialog(null, "CONFIRM COLUMN NUMBER TO BE REMOVED:\nAre you sure you " + "wish to remove column " + (colNumber+1) + "\nfrom the " + "input data set? 
(THIS OPERATION IS\nNOT REVERSABLE!)\n\n", "Confirm",JOptionPane.YES_NO_OPTION); if (result==0) return(true); else return(false); } /* ----------------------------------------------- */ /* */ /* DISTRIBUTE */ /* */ /* ----------------------------------------------- */ /* DISTRIBUTE CLASSES */ /* Distributes records through data set so that classes are equally distributed. */ private void distributeClasses() { int[][] localDataArray = new int[outputDataArray.length][]; for (int index=0;index1) { randomIndex = (int) (Math.random()*(numRecords-1)); for (int colIndex=0;colIndex The 2-D count array used in this method (and its sub-methods) is used to store the number of occurances of a possible value for the current attribute with respect to the different possible classes. The first index is set as follows:
  1. If the attribute is of type "int" and has fewer than 100 possible values: to the number of possible values.
  2. If attribute is of type "integer" and more than 100 possible values or attribute is if type "double" then to 100. */ private void processInputSchema() { // Declare count array int[][] countArray; // Step through schema up until the class attribute for(int colIndex=0;colIndex<(schema.length-1);colIndex++) { // Integer if (schema[colIndex].typeName.equals("int")) { // Define N element count array int length = 1 + (int) schema[colIndex].range; if (length<100) countArray = new int[length][numClasses+1]; else countArray = new int[100][numClasses+1]; processInputSchemaNumeric(countArray,colIndex); } // Double else if (schema[colIndex].typeName.equals("double")) { // Define a 100 element count array such that each element // represents a division. countArray = new int[100][numClasses+1]; processInputSchemaNumeric(countArray,colIndex); } // Nominal else if (schema[colIndex].typeName.equals("nominal")) { processInputSchemaNominal(colIndex); } // Otherwise unused (ignore) } } /* ---------------------------------------------------------- */ /* */ /* PROCESS SCHEMA NUMERIC */ /* */ /* ---------------------------------------------------------- */ /* PROCESS SCHEMA NUMERIC */ /** Identify sub-ranges, in numeric schema items, to be used in the normalisation.

    Note that count array is emty at this point. @param countArray the given array to hold support counts for particular values of the given attrivute matched to the available classes. @param colIndex the index in the data array of the current attribute to be provessed. */ private void processInputSchemaNumeric(int[][] countArray, int colIndex) { // Process elements in the indicated data array column and increment // the count array element values so that count array contains support // for the different possible values (represented by the range of the // given attribut), with respect to the given class. genSupportValuesNumeric(colIndex,countArray); // Determine dominant classes determineDominantClasses(countArray); // Create numerical division linked list int numDivisions = createDivisionsLinkedList(countArray); // Process division linked list merging divisions until required // number of divisions is reached. while (numDivisions > maxRange) numDivisions = mergeDivision(numDivisions); // Store divisions in schema array storeDivisionsInInputSchemaArray(numDivisions,colIndex, (double) countArray.length-1); } /* GENERATE SUPPORT VALUES NUMERIC */ /** Process elements in the indicated data array column and increment the count array element values so that count array contains support for the different possible values (represented by the range of the given attribut), with respect to the given class.

    Used with numeric attributes ('int' or 'double'). @param colIndex the index in the data array of the current attribute. @param countArray the given array of support counts. */ private void genSupportValuesNumeric(int colIndex, int countArray[][]) { // Calculate multiplyer double constantK = (double) countArray.length-1; // Get undex for class attribute int lastIndex = schema.length-1; // Process input data for (int rowIndex=0;rowIndex Note that for some columns there many be no data and therefore no immediate dominant class can be deduced. Similarly there may also be columns where the counts are equivalent, i.e. again no dominant class exits. In these cases the dominant class must be determined according to the dominatnt classes identified by the neigbouring columns (representing values) in the count array. Note also that If count array contains no dominant classes at all the dominant class for each value is set to 0. @param countArray the given array of support counts. */ private void determineDominantClasses(int countArray[][]) { int lastDominantIndex=-1; int i; boolean atStart=true; // Looop through count array for(i=0;i If so assigns this dominant class to the last element of the given array. @param array the given array. @return true if given array contains a dominant class and false otherwise. */ private boolean hasDominantClass(int array[]) { // Get start max value int maxValue=array[0]; int maxIndex=0; // Process rest of array (note that it may be that the given array // only has one value. for(int i=1;i<(array.length-1);i++) { if (maxValue < array[i]) { maxValue=array[i]; maxIndex=i; } } // Check that there is only one maximum value, if not return false int count=0; boolean hasDomClass = true; for(int i=0;i<(array.length-1);i++) { if (maxValue == array[i]) count++; if (count>1) return(!hasDomClass); } // Dominant class exists therefore allocate dominant class to to last // row of array and return. 
array[array.length-1]=maxIndex; return(hasDomClass); } /* CREATE DIVISIONS LINKED LIST */ /** Create a linked list describing the parameters for each division in the given count array.

    The divisions describe sub-ranges of values in the count array where all the values have the same dominant class. It may be that the attribute in question has no dominant class ay all in which case only ine division will be created. @param countArray the given array of support counts. @return the number of divisions. */ private int createDivisionsLinkedList(int[][] countArray) { Division nextMarker = null; int numDivisions = 0; // Get start value and index int startIndex = 0; int currentValue = countArray[0][numClasses]; // Create temp array of copy of first entry in count array int temp[] = new int[numClasses]; for(int j=0;j

  3. Create empty "dominant class" array indicating the dominant class shared at a division boundary. Each element represents a division boundary
  4. Create probabilities array indicating probability for the dominant class shared at division boundaries.
  5. Identify the division boundary with the highest probability for the dominant class were the divisions on either side of the boundary to be merged.
  6. Merge divisions at selected division boundary (has effect of removing current division).
  7. Check that no further merges are required with the divisions immediately before or after the merged pair. It may be that either the preceding or the following division has the same class as the newly merged division, in which case it should also be merged.
@param numDivisions the current number of divisiosn. */ private int mergeDivision(int numDivisions) { // Create empty "dominant class" array. int[] domClass = new int[numDivisions-1]; // Create probabilities array. double[] probArray = calcBoundaryProbs(numDivisions,domClass); // Select "best" divison boundary. int bestIndex = idBestDivBoundary(probArray); int domClassLabel = domClass[bestIndex]; // Merge divisions at selected division boundary. Commence by finding // division leading up to the boundary. Division nextRef = startDivisions; for (int i=0;iThis is used to detremine which divisons to merge. @param numDivisions the current number of divisiosn. @param domClass array in which to place identified dominant class label. */ private double[] calcBoundaryProbs(int numDivisions, int domClass[]) { Division nextRef = startDivisions.linkRef; int index = 0; // Create empty probability array double probArray[] = new double[numDivisions-1]; // Process linked list of divisions while (nextRef != null) { probArray[index] = calcBoundaryProb(nextRef.backRef,nextRef,index, domClass); index++; nextRef = nextRef.linkRef; } // Return return(probArray); } /** Determines the dominant class and its associate probability across two divisions sharing a common boundary.

The identified dominant class is placed in an array of dominant classes (passed in as an argument) whose indexes match to division identifiers (numbers). @param linkRef1 the first given divisiosn to be considered. @param linkRef2 the second given divisiosn to be considered. @param index the attribute index. @param domClass array in which to place identified dominant class label. @return the calculated best probability associated with the dominant class. */ private double calcBoundaryProb(Division linkRef1, Division linkRef2, int index, int domClass[]) { // Calculate total number of records per class represented by the // two divisions. int numRecords[] = new int[numClasses]; int totalNumRcrds = linkRef1.totalNumRecords + linkRef2.totalNumRecords; for (int i=0;icolIndex). @param colIndex the index in the data array of the current attribute to be processed. */ private void processInputSchemaNominal(int colIndex) { // Define 2-D count array (array of elments representing the possible // nominal values). int countArrayLength = schema[colIndex].nominalData.length; int[][] countArray = new int[countArrayLength][numClasses+1]; // Process data array and increment vount array element values // accordingly so that the count array contains support for each // nominal value with respect to each class. genSupportValuesNominal(colIndex,countArray); // Identify the dominant class fort each value (column) in the count // array. 
determineDominantClassesNominal(countArray); // Create nominal division linked list int numDivisions = createNominalDivsLinkedList(colIndex,countArray); // Process nominal division linked list to determine if any merging // can be undertaken if (numDivisions==0) { textArea.append("WARNING: Attribute " + schema[colIndex].name + " does not feature in any record\n"); } else numDivisions = procNominalDivLinkedList(numDivisions); // Store nominal divisions in schema array storeNominalDivsInInputSchemaArray(numDivisions,colIndex); } /* ------ GENERATE SUPPORT VALUES NOMINAL ------ */ /** Processes data set with respect to current nominal attribute and determines support counts for each element in the given count array @param colIndex the index in the data array of the current attribute. @param countArray the given array of support counts. */ private void genSupportValuesNominal(int colIndex, int countArray[][]) { // Get undex for class attribute int lastIndex = schema.length-1; // Process input data for (int rowIndex=0;rowIndex Note that for some elements there many be no data and therefore no dominant class can be deduced (similarly there will also be columns where the counts are equivalent and consequently no dominant class can be produced also). In these cases the dominant class value is set to -1. @param countArray the given array of support counts. */ private void determineDominantClassesNominal(int countArray[][]) { int lastDominantIndex=-1; int i; boolean atStart=true; // Loop through count array identifying dominant class in each case for(i=0;i 0.9) && (nextRef2.probability[nextRef2.dominantClass] > 0.9)) { addToFirstNominalDiv(nextRef1,nextRef2); return(true); } } nextRef2 = nextRef2.linkRef; } // Default return, no similar record found. return(false); } /* ------ ADD TO FIRST NOMINAL DIVISION ------ */ /** Combines two nominal divisions by adding the second given division to the first. @param firstDiv the division which is to be added to. 
@param secondDiv the division to be deleted. */ private void addToFirstNominalDiv(NominalDiv firstDiv, NominalDiv secondDiv) { // add values from second division to value list for first division. int newLength = firstDiv.listOfValues.length + secondDiv.listOfValues.length; int newListOfVlaues[] = new int[newLength]; int j=0; for (int i=0;i Operates as follows: 1) Consider each attribute in turn and identify appropriate boundaries. */ private void normalise() { textArea.append("NORMALISE DATA\n----------------\n"); // Calculate number of classes, if possible continue processing (must // be a nominal field). if (calcNumberOfClasses()) { // Process schema to identify ranges which in turn will define // attribute numbering. processInputSchema(); // Process determineBinaryOutput(); // Generate classes array (output purposes) generateClassesArray(); // End, set flag to true (to allow further processing), and output // message. hasNormalisationFlag = true; textArea.append("Normalisation complete\n\n"); } else textArea.append("Normalisation failed\n\n"); } /* DETERMINE BINARY OUTPUT */ /** Commence process of identifying binary output. */ private void determineBinaryOutput() { int attNumber=1; // Dimension output data array outputDataArray = new int[numRows][numColsInInputData]; // Process rows for (int rowN=0;rowNTEST_TERM) outputDataArray[rowN][colN] = (int) MISSING_ITEM; // Otherwise process else { boolean done =false; for (int i=0;iTEST_TERM) outputDataArray[rowN][colN] = (int) MISSING_ITEM; // Determine value else{ boolean notDone = true; int minValue = (int) schema.min; for (int i=0;i= minValue) && (number < (int) schema.divisionMarkers[i])) { outputDataArray[rowN][colN]=attNum+i; notDone = false; break; } minValue = (int) schema.divisionMarkers[i]; } if (notDone) outputDataArray[rowN][colN]= attNum+schema.numDivisions-1; } // Return return(attNum+schema.numDivisions); } /* DETERMINE BINARY OUTPUT DOUBLE (VALUE) */ /** Output attribute number for of double value. 
@param attNum the current attribute (column) number. @param number the given value in the current row/column intersection in the input data structure. @param rowIndex the current row index in the input data structure. @param colIndex the current column index in the input data structure. @param schema the current attribute in the schema. @return the revised attribute number.*/ private int determineBinaryOutputDouble(int attNum, double number, int rowN, int colN, InputSchema schema) { // Test for missing value if (number>TEST_TERM) outputDataArray[rowN][colN] = (int) MISSING_ITEM; // Determine value else { boolean notDone = true; double minValue = schema.min; // Check if contained in divisions before last. for (int i=0;igeberateClassArray method so that array is ordered according to frequency of classes. @param classesArray The 2-D array returned by the countSingles method. */ private void orderClassesArray() { int label, quantity; boolean isOrdered; int index; do { isOrdered = true; index = 0; while (index < (classesArray.length-1)) { if (classesArray[index][1]<=classesArray[index+1][1]) index++; else { isOrdered=false; // Swap label = classesArray[index][0]; quantity = classesArray[index][1]; classesArray[index][0] = classesArray[index+1][0]; classesArray[index][1] = classesArray[index+1][1]; classesArray[index+1][0] = label; classesArray[index+1][1] = quantity; // Increment index index++; } } } while (isOrdered==false); } /* ------------------------------------------------- */ /* */ /* OUTPUT METHODS */ /* */ /* ------------------------------------------------- */ /* LIST INPUT DATA */ /** Outputs stored input data set; initially read from input data file. 
*/ private void listInputData() { textArea.append("LIST DATA\n--------\n"); for(int index=0;indexTEST_TERM) itemSetStr = "?"; else itemSetStr = Double.toString(itemSet[0]); // Output rest for (int index=1;indexTEST_TERM) itemSetStr = itemSetStr + " ?"; else itemSetStr = itemSetStr + " " + itemSet[index]; } // Output textArea.append(itemSetStr); } /** Outputs a given item set of integers (omits missing values). @param itemSet the given item set. */ private void outputItemSet(int[] itemSet) { String itemSetStr = null; boolean noItemsInItemSetStr = true; // If empty item set return if (itemSet==null) return; // Output rest for (int index=0;index Done by processing schema array. */ private void listOutputSchema() { // Start textArea.append("LIST OUTPUT SCHEMA\n---------------------\n"); listOutputSchema2(); // End textArea.append("\n"); } /** Continues process of listing output schema (attribute labels) to textArea.

Done by processing schema array. */ private void listOutputSchema2() { int attNum=1; int colIndex; for(colIndex=0;colIndex<(schema.length-1);colIndex++) { // Integer if (schema[colIndex].typeName.equals("int")) attNum = outputIntLabel(attNum,schema[colIndex]); // Double if (schema[colIndex].typeName.equals("double")) attNum = outputDoubleLabel(attNum,schema[colIndex]); // Nominal if (schema[colIndex].typeName.equals("nominal")) attNum = outputNominalLabel(attNum,schema[colIndex]); } // Output class labels outputClassLabel(attNum,schema[colIndex]); } /* OUTPUT INTEGER LABEL */ /** Outputs the attribute numbers and labels for an integer column. @param attNum The current attribute number. @param schema the current attribute in the schema. @return the new current attribite number. */ private int outputIntLabel(int attNum, InputSchema schema) { // Only one division if (schema.numDivisions==1) { outputLabel(attNum,schema.name); attNum++; // Return return(attNum); } // If more than one divisiosn output first label String label = (schema.name + " <= " + (int) schema.divisionMarkers[0]); outputLabel(attNum,label); attNum++; // Output rest of labels int minValue = (int) schema.divisionMarkers[0]; for (int i=1;i<(schema.numDivisions-1);i++) { label = (minValue + " < " + schema.name + " <= " + (int) schema.divisionMarkers[i]); outputLabel(attNum,label); minValue = (int) schema.divisionMarkers[i]; attNum++; } // Output last label label = (minValue + " < " + schema.name); outputLabel(attNum,label); attNum++; // Return return(attNum); } /* OUTPUT DOUBLE LABEL */ /** Outputs the attribute numbers and labels for an double column. @param attNum The current attribute number. @param schema the current attribute in the schema. @return the new current attribite number. 
*/ private int outputDoubleLabel(int attNum, InputSchema schema) { // Only one division if (schema.numDivisions==1) { outputLabel(attNum,schema.name); attNum++; // Return return(attNum); } // Output first label String label = (schema.name + " <= " + outputToNdecPlaces(schema.divisionMarkers[0],3)); outputLabel(attNum,label); attNum++; // Output rest of labels double minValue = schema.divisionMarkers[0]; for (int i=1;i<(schema.numDivisions-1);i++) { label = (outputToNdecPlaces(minValue,3) + " < " + schema.name + " <= " + outputToNdecPlaces(schema.divisionMarkers[i],3)); outputLabel(attNum,label); minValue = schema.divisionMarkers[i]; attNum++; } // Output last label label = (outputToNdecPlaces(minValue,3) + " < " + schema.name); outputLabel(attNum,label); attNum++; // Return return(attNum); } /* OUTPUT NOMINAL LABEL */ /** Produces label for nominal field. @param attNum The current attribute number. @param schema the current attribute in the schema. @return the new current attribite number. */ private int outputNominalLabel(int attNum, InputSchema schema) { for (int i=0;i0) label = label + ","; label = label + schema.nominalData[schema.nominalDivs[i][j]]; } label = label + "}"; outputLabel(attNum,label); attNum++; } // Return return(attNum); } /* OUTPUT CLASS LABEL */ /** Produces label for class field. @param attNum The current attribute number. @param schema the current attribute in the schema. */ private void outputClassLabel(int attNum, InputSchema schema) { for (int index=0;index0) System.out.print(", "); System.out.print(nextRef.listOfValues[i]); } System.out.print("}, domClass = " + nextRef.dominantClass + "\n\tnumRecords = {" + nextRef.numRecords[0]); for (int i=1;i