From 1fc3ed5402ac1c12c97fdddd9db7454ad03fa171 Mon Sep 17 00:00:00 2001 From: Brandon Rodriguez <brodriguez8774@gmail.com> Date: Sun, 17 May 2020 09:37:38 -0400 Subject: [PATCH] Implement some core statistical functions --- documents/references.md | 28 +++++++ main.py | 62 ++++++++++++++- resources/stat_functions.py | 149 ++++++++++++++++++++++++++++++++++++ 3 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 resources/stat_functions.py diff --git a/documents/references.md b/documents/references.md index b813060..773e042 100644 --- a/documents/references.md +++ b/documents/references.md @@ -20,3 +20,31 @@ Logging files can be found in `resources/logs/`. ### Reading Files <https://www.pythonforbeginners.com/files/reading-and-writing-files-in-python> + +### Calling One Static Method from Another +<https://stackoverflow.com/a/1859975> + +### Numpy/Scipy +Numpy seems to be the recommended way (in Python) to handle large datasets for "scientific" usage. I recall using it in +the Neural Net class I took a few years ago, so it seems to make sense to use it again here. + +Even more so because, looking at the docs, it seems able to compute things much faster than one would attain with normal +Python code. + +Scipy is a scientific library that seems to now be generally associated with Numpy. + +#### Numpy Basics +<https://numpy.org/devdocs/user/quickstart.html> + +#### Python Array to Numpy Array +<https://stackoverflow.com/questions/10121926/initialise-numpy-array-of-unknown-length> + +#### Numpy Aggregation Functions +<https://jakevdp.github.io/PythonDataScienceHandbook/02.04-computation-on-arrays-aggregates.html> + +#### Finding the Range of Data +Looks like this isn't built into scipy/numpy. Have to create it ourselves, which isn't difficult. 
+<https://stackoverflow.com/a/12701694> + +#### Finding the Mode of Data +<https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html> diff --git a/main.py b/main.py index ed3f932..69a6c3c 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ Covers basic statistical measurements in data mining. # User Imports. from resources import logging as init_logging +from resources.stat_functions import DictAggregations # Initialize Logger. @@ -27,6 +28,9 @@ def calc_part_1(): Logic for "part 1" of assignment. """ student_array = [] + id_agg = {} + midterm_agg = {} + final_agg = {} # Open file. with open('./documents/data.online.scores.txt', 'r') as student_scores_file: @@ -42,7 +46,63 @@ def calc_part_1(): 'final': int(split_line[2]), }) - logger.info(student_array[len(student_array) - 1]) + # logger.info(student_array[len(student_array) - 1]) + + + # Get min. + id_agg['min'] = DictAggregations.min(student_array, 'id') + midterm_agg['min'] = DictAggregations.min(student_array, 'midterm') + final_agg['min'] = DictAggregations.min(student_array, 'final') + + # Get max. + id_agg['max'] = DictAggregations.max(student_array, 'id') + midterm_agg['max'] = DictAggregations.max(student_array, 'midterm') + final_agg['max'] = DictAggregations.max(student_array, 'final') + + # Get sum. + id_agg['sum'] = DictAggregations.sum(student_array, 'id') + midterm_agg['sum'] = DictAggregations.sum(student_array, 'midterm') + final_agg['sum'] = DictAggregations.sum(student_array, 'final') + + # Get mean. + id_agg['mean'] = DictAggregations.mean(student_array, 'id') + midterm_agg['mean'] = DictAggregations.mean(student_array, 'midterm') + final_agg['mean'] = DictAggregations.mean(student_array, 'final') + + # Get median. + id_agg['median'] = DictAggregations.median(student_array, 'id') + midterm_agg['median'] = DictAggregations.median(student_array, 'midterm') + final_agg['median'] = DictAggregations.median(student_array, 'final') + + # Get mode. 
+ id_agg['mode'] = DictAggregations.mode(student_array, 'id') + midterm_agg['mode'] = DictAggregations.mode(student_array, 'midterm') + final_agg['mode'] = DictAggregations.mode(student_array, 'final') + + # Get variance. + id_agg['variance'] = DictAggregations.variance(student_array, 'id') + midterm_agg['variance'] = DictAggregations.variance(student_array, 'midterm') + final_agg['variance'] = DictAggregations.variance(student_array, 'final') + + # Get standard deviation. + id_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'id') + midterm_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'midterm') + final_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'final') + + # Display results. + logger.info('') + logger.info('Id Analysis Results (for debugging):') + for key, item in id_agg.items(): + logger.info(' {0}: {1}'.format(key, item)) + logger.info('') + logger.info('Midterm Analysis Results:') + for key, item in midterm_agg.items(): + logger.info(' {0}: {1}'.format(key, item)) + logger.info('') + logger.info('Final Analysis Results:') + for key, item in final_agg.items(): + logger.info(' {0}: {1}'.format(key, item)) + logger.info('') if __name__ == '__main__': diff --git a/resources/stat_functions.py b/resources/stat_functions.py new file mode 100644 index 0000000..9820d05 --- /dev/null +++ b/resources/stat_functions.py @@ -0,0 +1,149 @@ +""" +Various stat functions to run. +""" + +# System Imports. +import numpy +from scipy import stats + +# User Imports. +from resources import logging as init_logging + + +# Initialize Logger. +logger = init_logging.get_logger(__name__) + + +class DictAggregations(): + """ + These assume storage in the format of an array of dictionaries, where each dictionary represents a record. + Forms aggregation functions on a single dict attribute for all present records. 
+ Works fine for small amounts of data but probably not very efficient for larger datasets. + """ + @staticmethod + def min(struct, attribute): + """ + Finds the min of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Min of dict attribute. + """ + logger.info('Finding min of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. + return numpy.min(np_array) + + @staticmethod + def max(struct, attribute): + """ + Finds the max of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Max of dict attribute. + """ + logger.info('Finding max of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. + return numpy.max(np_array) + + @staticmethod + def sum(struct, attribute): + """ + Sums the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Sum of dict attribute. + """ + logger.info('Finding sum of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. + return numpy.sum(np_array) + + @staticmethod + def mean(struct, attribute): + """ + Finds the mean of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Mean of dict attribute. + """ + logger.info('Finding mean of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. 
+ return numpy.mean(np_array) + + @staticmethod + def median(struct, attribute): + """ + Finds the median of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Median of dict attribute. + """ + logger.info('Finding median of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. + return numpy.median(np_array) + + @staticmethod + def mode(struct, attribute): + """ + Finds the mode of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Mode of dict attribute, as returned by scipy's stats.mode. + """ + logger.info('Finding mode of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. + return stats.mode(np_array) + + @staticmethod + def variance(struct, attribute): + """ + Finds the variance of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Variance of dict attribute. + """ + logger.info('Finding variance of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. + return numpy.var(np_array) + + @staticmethod + def standard_deviation(struct, attribute): + """ + Finds the standard deviation of the attribute (key) for the provided dict. + :param struct: Data structure to aggregate on. + :param attribute: Attribute to aggregate. + :return: Standard deviation of dict attribute. + """ + logger.info('Finding standard deviation of "{0}".'.format(attribute)) + + # Create numpy array of attribute. + np_array = numpy.array([x[attribute] for x in struct]) + + # Return aggregation. + return numpy.std(np_array) -- GitLab