From 96e143b150f09c766d45f5fa7fb8092974b8ae10 Mon Sep 17 00:00:00 2001 From: Brandon Rodriguez <brodriguez8774@gmail.com> Date: Sun, 17 May 2020 11:11:31 -0400 Subject: [PATCH] Create additional aggregation functions --- main.py | 49 +---------- resources/stat_functions.py | 158 ++++++++++++++++++++++++++++++------ 2 files changed, 138 insertions(+), 69 deletions(-) diff --git a/main.py b/main.py index 69a6c3c..0e743bf 100644 --- a/main.py +++ b/main.py @@ -6,14 +6,13 @@ Covers basic statistical measurements in data mining. # User Imports. from resources import logging as init_logging -from resources.stat_functions import DictAggregations +from resources.stat_functions import Aggregations # Initialize Logger. logger = init_logging.get_logger(__name__) - def main(): """ Program main. @@ -28,9 +27,6 @@ def calc_part_1(): Logic for "part 1" of assignment. """ student_array = [] - id_agg = {} - midterm_agg = {} - final_agg = {} # Open file. with open('./documents/data.online.scores.txt', 'r') as student_scores_file: @@ -48,46 +44,9 @@ def calc_part_1(): # logger.info(student_array[len(student_array) - 1]) - - # Get min. - id_agg['min'] = DictAggregations.min(student_array, 'id') - midterm_agg['min'] = DictAggregations.min(student_array, 'midterm') - final_agg['min'] = DictAggregations.min(student_array, 'final') - - # Get max. - id_agg['max'] = DictAggregations.max(student_array, 'id') - midterm_agg['max'] = DictAggregations.max(student_array, 'midterm') - final_agg['max'] = DictAggregations.max(student_array, 'final') - - # Get sum. - id_agg['sum'] = DictAggregations.sum(student_array, 'id') - midterm_agg['sum'] = DictAggregations.sum(student_array, 'midterm') - final_agg['sum'] = DictAggregations.sum(student_array, 'final') - - # Get mean. - id_agg['mean'] = DictAggregations.mean(student_array, 'id') - midterm_agg['mean'] = DictAggregations.mean(student_array, 'midterm') - final_agg['mean'] = DictAggregations.mean(student_array, 'final') - - # Get median. 
- id_agg['median'] = DictAggregations.median(student_array, 'id') - midterm_agg['median'] = DictAggregations.median(student_array, 'midterm') - final_agg['median'] = DictAggregations.median(student_array, 'final') - - # Get mode. - id_agg['mode'] = DictAggregations.mode(student_array, 'id') - midterm_agg['mode'] = DictAggregations.mode(student_array, 'midterm') - final_agg['mode'] = DictAggregations.mode(student_array, 'final') - - # Get variance. - id_agg['variance'] = DictAggregations.variance(student_array, 'id') - midterm_agg['variance'] = DictAggregations.variance(student_array, 'midterm') - final_agg['variance'] = DictAggregations.variance(student_array, 'final') - - # Get standard deviation. - id_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'id') - midterm_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'midterm') - final_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'final') + id_agg = Aggregations.all(student_array, 'id', display=False) + midterm_agg = Aggregations.all(student_array, 'midterm', display=False) + final_agg = Aggregations.all(student_array, 'final', display=False) # Display results. logger.info('') diff --git a/resources/stat_functions.py b/resources/stat_functions.py index 9820d05..8139734 100644 --- a/resources/stat_functions.py +++ b/resources/stat_functions.py @@ -1,5 +1,9 @@ """ Various stat functions to run. + +These assume storage in the format of an array of dictionaries, where each dictionary represents a record. +Thus, they work fine for small amounts of data but are probably not very efficient for larger datasets. +Larger datasets (where efficiency and runtime matter) should probably instead use direct numpy arrays. """ # System Imports. 
@@ -14,21 +18,53 @@ from resources import logging as init_logging logger = init_logging.get_logger(__name__) -class DictAggregations(): +class Aggregations(): """ - These assume storage in the format of an array of dictionaries, where each dictionary represents a record. Forms aggregation functions on a single dict attribute for all present records. - Works fine for small data amounts of data but probably not very efficient for larger datasets. """ @staticmethod - def min(struct, attribute): + def all(struct, attribute, display=True): + """ + Finds all available aggregations of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. + :return: All available aggregations of dict attribute. + """ + if display: + logger.info('Finding all aggregations of "{0}".'.format(attribute)) + + # Run all aggregations and store to dictionary. + agg_dict = {} + agg_dict['min'] = Aggregations.min(struct, attribute, display=display) + agg_dict['max'] = Aggregations.max(struct, attribute, display=display) + agg_dict['sum'] = Aggregations.sum(struct, attribute, display=display) + agg_dict['mean'] = Aggregations.mean(struct, attribute, display=display) + agg_dict['median'] = Aggregations.median(struct, attribute, display=display) + mode_struct = Aggregations.mode(struct, attribute, display=display) + agg_dict['mode'] = mode_struct.mode[0] + agg_dict['mode_count'] = mode_struct.count[0] + agg_dict['range'] = Aggregations.range(struct, attribute, display=display) + agg_dict['variance'] = Aggregations.variance(struct, attribute, display=display) + agg_dict['standard_deviation'] = Aggregations.standard_deviation(struct, attribute, display=display) + agg_dict['first_quartile'] = Aggregations.first_quartile(struct, attribute, display=display) + agg_dict['second_quartile'] = Aggregations.second_quartile(struct, attribute, display=display) + 
agg_dict['third_quartile'] = Aggregations.third_quartile(struct, attribute, display=display) + + # Return aggregations. + return agg_dict + + @staticmethod + def min(struct, attribute, display=True): """ Finds the min of the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. :return: Min of dict attribute. """ - logger.info('Finding min of "{0}".'.format(attribute)) + if display: + logger.info('Finding min of "{0}".'.format(attribute)) # Create numpy array of attribute. np_array = numpy.array([x[attribute] for x in struct]) @@ -37,14 +73,16 @@ class DictAggregations(): return numpy.min(np_array) @staticmethod - def max(struct, attribute): + def max(struct, attribute, display=True): """ Finds the max of the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. :return: Max of dict attribute. """ - logger.info('Finding max of "{0}".'.format(attribute)) + if display: + logger.info('Finding max of "{0}".'.format(attribute)) # Create numpy array of attribute. np_array = numpy.array([x[attribute] for x in struct]) @@ -53,14 +91,16 @@ class DictAggregations(): return numpy.max(np_array) @staticmethod - def sum(struct, attribute): + def sum(struct, attribute, display=True): """ Sums the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. :return: Sum of dict attribute. """ - logger.info('Finding sum of "{0}".'.format(attribute)) + if display: + logger.info('Finding sum of "{0}".'.format(attribute)) # Create numpy array of attribute. 
np_array = numpy.array([x[attribute] for x in struct]) @@ -69,14 +109,16 @@ class DictAggregations(): return numpy.sum(np_array) @staticmethod - def mean(struct, attribute): + def mean(struct, attribute, display=True): """ Finds the mean of the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. - :return: Sum of dict attribute. + :param display: Bool indicating if helper text should display. + :return: Mean of dict attribute. """ - logger.info('Finding mean of "{0}".'.format(attribute)) + if display: + logger.info('Finding mean of "{0}".'.format(attribute)) # Create numpy array of attribute. np_array = numpy.array([x[attribute] for x in struct]) @@ -85,14 +127,16 @@ class DictAggregations(): return numpy.mean(np_array) @staticmethod - def median(struct, attribute): + def median(struct, attribute, display=True): """ Finds the median of the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. - :return: Sum of dict attribute. + :param display: Bool indicating if helper text should display. + :return: Median of dict attribute. """ - logger.info('Finding median of "{0}".'.format(attribute)) + if display: + logger.info('Finding median of "{0}".'.format(attribute)) # Create numpy array of attribute. np_array = numpy.array([x[attribute] for x in struct]) @@ -101,14 +145,16 @@ class DictAggregations(): return numpy.median(np_array) @staticmethod - def mode(struct, attribute): + def mode(struct, attribute, display=True): """ Finds the mode of the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. - :return: Sum of dict attribute. + :param display: Bool indicating if helper text should display. + :return: Mode of dict attribute. 
""" - logger.info('Finding mode of "{0}".'.format(attribute)) + if display: + logger.info('Finding mode of "{0}".'.format(attribute)) # Create numpy array of attribute. np_array = numpy.array([x[attribute] for x in struct]) @@ -117,14 +163,31 @@ class DictAggregations(): return stats.mode(np_array) @staticmethod - def variance(struct, attribute): + def range(struct, attribute, display=True): + """ + Finds the range of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. + :return: Range of dict attribute. + """ + if display: + logger.info('Finding range of "{0}".'.format(attribute)) + + # Return aggregation. + return Aggregations.max(struct, attribute, display=False) - Aggregations.min(struct, attribute, display=False) + + @staticmethod + def variance(struct, attribute, display=True): """ Finds the variance of the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. - :return: Sum of dict attribute. + :param display: Bool indicating if helper text should display. + :return: Variance of dict attribute. """ - logger.info('Finding variance of "{0}".'.format(attribute)) + if display: + logger.info('Finding variance of "{0}".'.format(attribute)) # Create numpy array of attribute. np_array = numpy.array([x[attribute] for x in struct]) @@ -133,17 +196,64 @@ class DictAggregations(): return numpy.var(np_array) @staticmethod - def standard_deviation(struct, attribute): + def standard_deviation(struct, attribute, display=True): """ Finds the standard deviation of the attribute (key) for the provided dict. :param struct: Data structure to aggregate on. :param attribute: Attribute to aggregate. - :return: Sum of dict attribute. + :param display: Bool indicating if helper text should display. + :return: Standard deviation of dict attribute. 
""" - logger.info('Finding standard deviation of "{0}".'.format(attribute)) + if display: + logger.info('Finding standard deviation of "{0}".'.format(attribute)) # Create numpy array of attribute. np_array = numpy.array([x[attribute] for x in struct]) # Return aggregation. return numpy.std(np_array) + + @staticmethod + def first_quartile(struct, attribute, display=True): + """ + Finds the first quartile of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. + :return: First quartile of dict attribute. + """ + if display: + logger.info('Finding first quartile of "{0}".'.format(attribute)) + + # Return aggregation. + return Aggregations.median(struct[:int(len(struct) / 2)], attribute, display=False) + + @staticmethod + def second_quartile(struct, attribute, display=True): + """ + Finds the second quartile (median) of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. + :return: Second quartile of dict attribute. + """ + if display: + logger.info('Finding second quartile (median) of "{0}".'.format(attribute)) + + # Return aggregation. + return Aggregations.median(struct, attribute, display=False) + + @staticmethod + def third_quartile(struct, attribute, display=True): + """ + Finds the third quartile of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :param display: Bool indicating if helper text should display. + :return: Third quartile of dict attribute. + """ + if display: + logger.info('Finding third quartile of "{0}".'.format(attribute)) + + # Return aggregation. + return Aggregations.median(struct[int(len(struct) / 2):], attribute, display=False) -- GitLab