From 77d1bf3e5c5fa5fc690c55363a88da6dfe2a670b Mon Sep 17 00:00:00 2001
From: Brandon Rodriguez <brodriguez8774@gmail.com>
Date: Wed, 23 May 2018 19:31:03 -0400
Subject: [PATCH] Improve normalization of data

---
 main.py       |   3 +-
 neural_net.py | 184 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 127 insertions(+), 60 deletions(-)

diff --git a/main.py b/main.py
index a44fe90..e833d22 100644
--- a/main.py
+++ b/main.py
@@ -18,9 +18,10 @@ logger = logging.get_logger(__name__)
 housing_data = pandas.read_csv('./Documents/other_housing.csv')
 
-# Initially only work with first 10, for testing purposes.
-housing_data = housing_data[0:3]
+# Initially, only work with the first few rows, for testing purposes.
+housing_data = housing_data[0:5]
 
 # Normalize data.
+logger.info('Normalizing housing data.')
 normalizer = neural_net.Normalizer()
 normalized_data = normalizer.normalize_data(housing_data)
 features = normalized_data.loc[:, normalized_data.columns != 'SalePrice']
diff --git a/neural_net.py b/neural_net.py
index d60a289..b8624d9 100644
--- a/neural_net.py
+++ b/neural_net.py
@@ -3,6 +3,7 @@ Neural Net logic.
 """
 
 # System Imports.
+from sklearn import preprocessing
 import math, numpy, pandas
 
 # User Class Imports.
@@ -17,78 +18,143 @@ class Normalizer():
     """
     Handles data normalization.
     """
-    def normalize_data(self, data):
+    def normalize_data(self, orig_data):
         """
         Normalizes and returns provided dataset.
-        :param data: A pandas array of housing data to normalize.
-        :return: The normalized array of housing data.
+        :param orig_data: A pandas DataFrame of housing data to normalize.
+        :return: The normalized DataFrame of housing data.
         """
         # Print out data.
-        # logger.info(data)
-        # logger.info(data.columns.values)
+        # logger.info(orig_data)
+        # logger.info(orig_data.columns.values)
+        # logger.info(orig_data['Alley'])
+
+        # Remove NaN references.
+        # orig_data = orig_data.fillna(value='NaN')
 
         # Address individual columns.
         normalized_data = pandas.DataFrame()
-        column = 'Lot Area'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
-
-        column = 'Year Built'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
-
-        column = 'Year Remod/Add'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
-
-        column = 'Fireplaces'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
-
-        column = 'Garage Area'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
-
-        column = 'Pool Area'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
-
-        column = 'Yr Sold'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
-
-        column = 'SalePrice'
-        if column in data.columns:
-            frame = pandas.DataFrame(data[column])
-            normalized_data = normalized_data.join(frame, how='outer')
-            data = data.loc[:, data.columns != column]
+        continuous_columns = [
+            'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
+            '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF',
+            'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val',
+        ]
+        discrete_columns = [
+            'Year Built', 'Year Remod/Add', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
+            'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold',
+            'Yr Sold',
+        ]
+        categorical_columns = [
+            'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1',
+            'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd',
+            'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature', 'Sale Type',
+            'Sale Condition',
+        ]
+        categorical_dict = {}
+        ordinal_columns = [
+            'Lot Shape', 'Land Slope', 'Overall Qual', 'Overall Cond', 'Exter Qual', 'Exter Cond', 'Bsmt Qual',
+            'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating QC', 'Electrical',
+            'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive',
+            'Pool QC', 'Fence',
+        ]
+        ordinal_dict = {}
+        ignored_columns = ['Utilities',]
+        target_column = ['SalePrice',]
+
+        # Process continuous data.
+        for column in continuous_columns:
+            if column in orig_data.columns:
+                self.squish_values(orig_data, column)
+                # Add column to normalized data list.
+                frame = pandas.DataFrame(orig_data[column])
+                normalized_data = normalized_data.join(frame, how='outer')
+                orig_data = orig_data.loc[:, orig_data.columns != column]
+
+        # Process discrete data. Currently handled as if it were continuous.
+        for column in discrete_columns:
+            if column in orig_data.columns:
+                self.squish_values(orig_data, column)
+                # Add column to normalized data list.
+                frame = pandas.DataFrame(orig_data[column])
+                normalized_data = normalized_data.join(frame, how='outer')
+                orig_data = orig_data.loc[:, orig_data.columns != column]
+
+        # Process categorical data.
+        for column in categorical_columns:
+            if column in orig_data.columns:
+                # Remove NaN references.
+                orig_data[column] = orig_data[column].fillna(value='NaN')
+                # Turn single column into onehot matrix.
+                onehot_tuple = self.create_onehot(orig_data[column])
+                # Add onehot matrix to normalized data list.
+                frame = pandas.DataFrame(onehot_tuple[0])
+                normalized_data = normalized_data.join(frame, how='outer')
+                orig_data = orig_data.loc[:, orig_data.columns != column]
+                # Save newly created columns associated with the original column.
+                categorical_dict[column] = onehot_tuple[1]
+
+        # Process ordinal data. Currently handled the same as categorical. Perhaps there is a better way?
+        for column in ordinal_columns:
+            if column in orig_data.columns:
+                # Remove NaN references.
+                orig_data[column] = orig_data[column].fillna(value='NaN')
+                # Turn single column into onehot matrix.
+                onehot_tuple = self.create_onehot(orig_data[column])
+                # Add onehot matrix to normalized data list.
+                frame = pandas.DataFrame(onehot_tuple[0])
+                normalized_data = normalized_data.join(frame, how='outer')
+                orig_data = orig_data.loc[:, orig_data.columns != column]
+                # Save newly created columns associated with the original column.
+                ordinal_dict[column] = onehot_tuple[1]
+
+        # Columns to be ignored.
+        for column in ignored_columns:
+            orig_data = orig_data.loc[:, orig_data.columns != column]
+
+        # Target data. Should this be normalized as well?
+        # Add column to normalized data list.
+        frame = pandas.DataFrame(orig_data[target_column])
+        normalized_data = normalized_data.join(frame, how='outer')
 
         return normalized_data
 
-    def squish_values(self):
+    def squish_values(self, orig_data, column):
         """
-        Squishes vector values to be between 0 and 1.
-        :return:
+        Zero-centers the given column, then scales it by its standard deviation. Modifies orig_data in place.
+        Referenced from http://lamda.nju.edu.cn/weixs/project/CNNTricks/CNNTricks.html
         """
-        pass
+        # logger.info('Pre Normalization: {0}'.format(orig_data[column]))
+        x_value = orig_data[column]
+        x_value -= numpy.mean(x_value, axis=0)  # Zero-center.
+        x_value /= numpy.std(x_value, axis=0)  # Normalize.
+        # logger.info('Post Normalization: {0}'.format(orig_data[column]))
+
+    def create_onehot(self, column):
+        """
+        Creates a onehot matrix based on the given column.
+        Each unique value is turned into a new column.
+        Each row has a 1 in the column matching its original value, and 0 in all others.
+        :param column: Column of data to onehot.
+        :return: Tuple of (onehot DataFrame, original category values). Columns are denoted by "columnName__value".
+        """
+        # Used Dillon's magic code as reference to create onehot.
+        label_enc = preprocessing.LabelEncoder()
+        label_enc.fit(column)
+        int_label = label_enc.transform(column)
+        int_label = int_label.reshape(-1, 1)
+        column_enc = preprocessing.OneHotEncoder(sparse=False)
+        column_onehot = column_enc.fit_transform(int_label)
+
+        # Create meaningful labels for onehot, using initial category values.
+        new_labels = []
+        for label in label_enc.classes_:
+            new_labels.append(str(column.name) + '__' + str(label))
+        column_array = pandas.DataFrame(column_onehot, columns=new_labels)
+
+        return column_array, label_enc.classes_
 
-    def separate_categories(self):
-        pass
 
 class BackPropNet():
     """
@@ -105,8 +171,7 @@ class BackPropNet():
         Each layer has sets of weights equal to the number of nodes in the layer.
         Each set of weights has x values, where x is the number of nodes in the previous layer, plus a bias.
         Weight values are randomized values near 0, using a normal distribution.
-        :param data:
-        :return:
+        :param data: Data to reference for input layer count.
         """
         # Create first hidden layer.
         hidden_layer_1 = []
@@ -132,10 +197,10 @@
         self.network.append(hidden_layer_2)
         self.network.append(output_layer)
 
-        logger.info('Network:')
+        # logger.info('Network:')
         index = 0
         for layer in self.network:
-            logger.info('Layer {0}: {1}'.format(index, layer))
+            # logger.info('Layer {0}: {1}'.format(index, layer))
             index += 1
 
     def activation(self, weights, inputs):
@@ -176,6 +241,7 @@
         :return: Output results of network.
         """
         outputs = None
+        # Iterate through each layer in the network, using the previous layer's outputs as the next layer's inputs.
        for layer in self.network:
             outputs = []
             for neuron in layer:
--
GitLab
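
As a sanity check on the squish_values() math, the snippet below applies the same zero-center-then-scale steps to a made-up column (the column name and values are illustrative, not taken from other_housing.csv):

    import numpy
    import pandas

    # Hypothetical stand-in for one continuous column of the housing data.
    data = pandas.DataFrame({'Lot Area': [8450.0, 9600.0, 11250.0, 9550.0, 14260.0]})

    # Same math as squish_values(): zero-center, then divide by the standard deviation.
    column = data['Lot Area']
    standardized = (column - numpy.mean(column, axis=0)) / numpy.std(column, axis=0)

    print(numpy.mean(standardized))  # Roughly 0.
    print(numpy.std(standardized))   # Roughly 1.

Note this yields mean 0 and standard deviation 1, not values between 0 and 1; min-max scaling would be the choice if a strict [0, 1] range were wanted.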
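The onehot path can be exercised on its own. The column below is hypothetical, 'NaN' is the fill value normalize_data() substitutes before encoding, and OneHotEncoder(sparse=False) matches the scikit-learn API current when this patch was written (later releases renamed the argument to sparse_output):

    import pandas
    from sklearn import preprocessing

    column = pandas.Series(['Pave', 'Grvl', 'NaN', 'Pave'], name='Street')

    # Same steps as create_onehot(): label-encode, then onehot the integer labels.
    label_enc = preprocessing.LabelEncoder()
    int_label = label_enc.fit_transform(column).reshape(-1, 1)
    column_enc = preprocessing.OneHotEncoder(sparse=False)
    column_onehot = column_enc.fit_transform(int_label)

    # "columnName__value" labels, as in create_onehot().
    new_labels = ['{0}__{1}'.format(column.name, label) for label in label_enc.classes_]
    print(pandas.DataFrame(column_onehot, columns=new_labels))
    # Columns: Street__Grvl, Street__NaN, Street__Pave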
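The propagate_forward() hunk only shows the start of its loop, and activation()'s body is outside this diff, so the sketch below is one conventional reading (weighted sum with a trailing bias weight, sigmoid transfer), not necessarily the file's actual code:

    import math

    def activation(weights, inputs):
        # Weighted sum of inputs; the final weight acts as the bias.
        total = weights[-1]
        for weight, value in zip(weights[:-1], inputs):
            total += weight * value
        return total

    def sigmoid(value):
        return 1.0 / (1.0 + math.exp(-value))

    # Toy network: one hidden neuron (two weights plus bias), one output neuron.
    network = [
        [[0.1, 0.2, 0.5]],
        [[0.4, 0.3]],
    ]

    # Each layer's outputs become the next layer's inputs, as in propagate_forward().
    inputs = [1.0, 2.0]
    for layer in network:
        inputs = [sigmoid(activation(neuron, inputs)) for neuron in layer]
    print(inputs)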