diff --git a/main.py b/main.py
index 0e743bfeb9f9e4a54ac4ac2457d47d59ca2417c1..582912c51b2945da9552e4d3245ee3d52ad9475e 100644
--- a/main.py
+++ b/main.py
@@ -6,7 +6,7 @@ Covers basic statistical measurements in data mining.
 
 # User Imports.
 from resources import logging as init_logging
-from resources.stat_functions import Aggregations
+from resources.stat_functions import Aggregations, Normalizations
 
 
 # Initialize Logger.
@@ -20,6 +20,7 @@ def main():
     logger.info('Starting main()')
 
     calc_part_1()
+    calc_part_2()
 
 
 def calc_part_1():
@@ -44,6 +45,50 @@ def calc_part_1():
 
     # logger.info(student_array[len(student_array) - 1])
 
+    # Get aggregation values.
+    id_agg = Aggregations.all(student_array, 'id', display=False)
+    midterm_agg = Aggregations.all(student_array, 'midterm', display=False)
+    final_agg = Aggregations.all(student_array, 'final', display=False)
+
+    # Display results.
+    logger.info('')
+    logger.info('Id Analysis Results (for debugging):')
+    for key, item in id_agg.items():
+        logger.info('    {0}: {1}'.format(key, item))
+    logger.info('')
+    logger.info('Midterm Analysis Results:')
+    for key, item in midterm_agg.items():
+        logger.info('    {0}: {1}'.format(key, item))
+    logger.info('')
+    logger.info('Final Analysis Results:')
+    for key, item in final_agg.items():
+        logger.info('    {0}: {1}'.format(key, item))
+    logger.info('')
+
+
+def calc_part_2():
+    student_array = []
+
+    # Open file.
+    with open('./documents/data.online.scores.txt', 'r') as student_scores_file:
+        # Read in each line and save to our struct.
+        for line in student_scores_file:
+            # Split on tabs.
+            split_line = line.strip().split('\t')
+
+            # Save to struct.
+            student_array.append({
+                'id': int(split_line[0]),
+                'midterm': int(split_line[1]),
+                'final': int(split_line[2]),
+            })
+
+    # logger.info(student_array[len(student_array) - 1])
+
+    # Normalize data.
+    student_array = Normalizations.z_score(student_array)
+
+    # Get aggregation values.
     id_agg = Aggregations.all(student_array, 'id', display=False)
     midterm_agg = Aggregations.all(student_array, 'midterm', display=False)
     final_agg = Aggregations.all(student_array, 'final', display=False)
diff --git a/resources/stat_functions.py b/resources/stat_functions.py
index 8139734e216eeac06db19e9271c0f9648cd281ff..c89828366848248c328c739a487c77cc16cdfbff 100644
--- a/resources/stat_functions.py
+++ b/resources/stat_functions.py
@@ -7,7 +7,7 @@ Larger datasets (where efficiency and runtime matter) should probably instead us
 """
 
 # System Imports.
-import numpy
+import copy, numpy
 from scipy import stats
 
 # User Imports.
@@ -257,3 +257,112 @@ class Aggregations():
 
         # Return aggregation.
         return Aggregations.median(struct[int(len(struct) / 2):], attribute, display=False)
+
+
+class Normalizations():
+    @staticmethod
+    def min_max(struct, new_min, new_max, attribute=None, display=True):
+        """
+        Normalizes the attribute (key) for the provided dict.
+        If no attribute is provided, then normalizes all attributes in provided dict.
+        :param struct: Data structure to normalize on.
+        :param new_min: New minimum to use for attribute.
+        :param new_max: New maximum to use for attribute.
+        :param attribute: Attribute to normalize. If left empty, then all attributes are normalized.
+        :param display: Bool indicating if helper text should display.
+        :return: A copy of the provided structure, with the given attribute normalized.
+        """
+        # Copy struct so we don't override original values.
+        struct = copy.deepcopy(struct)
+
+        # Check what attributes to normalize.
+        if attribute is None:
+            # Normalize all attributes.
+
+            # Examine all attributes.
+            for attribute in struct[0].keys():
+                if display:
+                    logger.info('Running Min-Max Normalization on "{0}".'.format(attribute))
+
+                # Get relevant aggregations.
+                old_min = Aggregations.min(struct, attribute, display=False)
+                old_max = Aggregations.max(struct, attribute, display=False)
+
+                # Loop through and normalize all values of attribute.
+                for index in range(len(struct)):
+                    old_value = struct[index][attribute]
+                    new_value = ((old_value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min
+                    struct[index][attribute] = new_value
+        else:
+            # Normalize single provided attribute.
+            if display:
+                logger.info('Running Min-Max Normalization on "{0}".'.format(attribute))
+
+            # Get relevant aggregations.
+            old_min = Aggregations.min(struct, attribute, display=False)
+            old_max = Aggregations.max(struct, attribute, display=False)
+
+            # Loop through and normalize all values of attribute.
+            for index in range(len(struct)):
+                old_value = struct[index][attribute]
+                new_value = ( (old_value - old_min) / (old_max - old_min) ) * (new_max - new_min) + new_min
+                struct[index][attribute] = new_value
+
+        # Return normalized struct.
+        return struct
+
+    @staticmethod
+    def zero_mean(struct, attribute=None, display=True):
+        """
+        Normalizes the attribute (key) for the provided dict.
+        If no attribute is provided, then normalizes all attributes in provided dict.
+        :param struct: Data structure to normalize on.
+        :param attribute: Attribute to normalize. If left empty, then all attributes are normalized.
+        :param display: Bool indicating if helper text should display.
+        :return: A copy of the provided structure, with the given attribute normalized.
+        """
+        # Copy struct so we don't override original values.
+        struct = copy.deepcopy(struct)
+
+        # Check what attributes to normalize.
+        if attribute is None:
+            # Normalize all attributes.
+
+            # Examine all attributes.
+            for attribute in struct[0].keys():
+                if display:
+                    logger.info('Running Zero-Mean Normalization on "{0}".'.format(attribute))
+
+                # Get relevant aggregations.
+                mean = Aggregations.mean(struct, attribute, display=False)
+                standard_deviation = Aggregations.standard_deviation(struct, attribute, display=False)
+
+                # Loop through and normalize all values of attribute.
+                for index in range(len(struct)):
+                    old_value = struct[index][attribute]
+                    new_value = (old_value - mean) / standard_deviation
+                    struct[index][attribute] = new_value
+        else:
+            # Normalize single provided attribute.
+            if display:
+                logger.info('Running Zero-Mean Normalization on "{0}".'.format(attribute))
+
+            # Get relevant aggregations.
+            mean = Aggregations.mean(struct, attribute, display=False)
+            standard_deviation = Aggregations.standard_deviation(struct, attribute, display=False)
+
+            # Loop through and normalize all values of attribute.
+            for index in range(len(struct)):
+                old_value = struct[index][attribute]
+                new_value = (old_value - mean) / standard_deviation
+                struct[index][attribute] = new_value
+
+        # Return normalized struct.
+        return struct
+
+    @staticmethod
+    def z_score(struct, attribute=None, display=True):
+        """
+        Alias for "zero_mean" normalization.
+        """
+        return Normalizations.zero_mean(struct, attribute=attribute, display=display)