From 1fc3ed5402ac1c12c97fdddd9db7454ad03fa171 Mon Sep 17 00:00:00 2001
From: Brandon Rodriguez <brodriguez8774@gmail.com>
Date: Sun, 17 May 2020 09:37:38 -0400
Subject: [PATCH] Implement some core statistical functions

---
 documents/references.md     |  28 +++++++
 main.py                     |  62 ++++++++++++++-
 resources/stat_functions.py | 149 ++++++++++++++++++++++++++++++++++++
 3 files changed, 238 insertions(+), 1 deletion(-)
 create mode 100644 resources/stat_functions.py

diff --git a/documents/references.md b/documents/references.md
index b813060..773e042 100644
--- a/documents/references.md
+++ b/documents/references.md
@@ -20,3 +20,31 @@ Logging files can be found in `resources/logs/`.
 
 ### Reading Files
 <https://www.pythonforbeginners.com/files/reading-and-writing-files-in-python>
+
+### Calling One Static Method from Another
+<https://stackoverflow.com/a/1859975>
+
+### Numpy/Scipy
+Numpy seems to be the recommended way (in Python) to handle large datasets for "scientific" usage. I recall using it in
+the Neural Net class I took a few years ago, so it seems to make sense to use it again here.
+
+Even more so because, looking at the docs, it seems able to compute things much faster than one would attain with normal
+Python code.
+
+Scipy is a scientific library that seems to now be generally associated with Numpy.
+
+#### Numpy Basics
+<https://numpy.org/devdocs/user/quickstart.html>
+
+#### Python Array to Numpy Array
+<https://stackoverflow.com/questions/10121926/initialise-numpy-array-of-unknown-length>
+
+#### Numpy Aggregation Functions
+<https://jakevdp.github.io/PythonDataScienceHandbook/02.04-computation-on-arrays-aggregates.html>
+
+#### Finding the Range of Data
+Looks like this isn't built into scipy/numpy. Have to create it ourselves, which isn't difficult.
+<https://stackoverflow.com/a/12701694>
+
+#### Finding the Mode of Data
+<https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html>
diff --git a/main.py b/main.py
index ed3f932..69a6c3c 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,7 @@ Covers basic statistical measurements in data mining.
 
 # User Imports.
 from resources import logging as init_logging
+from resources.stat_functions import DictAggregations
 
 
 # Initialize Logger.
@@ -27,6 +28,9 @@ def calc_part_1():
     Logic for "part 1" of assignment.
     """
     student_array = []
+    id_agg = {}
+    midterm_agg = {}
+    final_agg = {}
 
     # Open file.
     with open('./documents/data.online.scores.txt', 'r') as student_scores_file:
@@ -42,7 +46,63 @@ def calc_part_1():
                 'final': int(split_line[2]),
             })
 
-            logger.info(student_array[len(student_array) - 1])
+            # logger.info(student_array[len(student_array) - 1])
+
+
+    # Get min.
+    id_agg['min'] = DictAggregations.min(student_array, 'id')
+    midterm_agg['min'] = DictAggregations.min(student_array, 'midterm')
+    final_agg['min'] = DictAggregations.min(student_array, 'final')
+
+    # Get max.
+    id_agg['max'] = DictAggregations.max(student_array, 'id')
+    midterm_agg['max'] = DictAggregations.max(student_array, 'midterm')
+    final_agg['max'] = DictAggregations.max(student_array, 'final')
+
+    # Get sum.
+    id_agg['sum'] = DictAggregations.sum(student_array, 'id')
+    midterm_agg['sum'] = DictAggregations.sum(student_array, 'midterm')
+    final_agg['sum'] = DictAggregations.sum(student_array, 'final')
+
+    # Get mean.
+    id_agg['mean'] = DictAggregations.mean(student_array, 'id')
+    midterm_agg['mean'] = DictAggregations.mean(student_array, 'midterm')
+    final_agg['mean'] = DictAggregations.mean(student_array, 'final')
+
+    # Get median.
+    id_agg['median'] = DictAggregations.median(student_array, 'id')
+    midterm_agg['median'] = DictAggregations.median(student_array, 'midterm')
+    final_agg['median'] = DictAggregations.median(student_array, 'final')
+
+    # Get mode.
+    id_agg['mode'] = DictAggregations.mode(student_array, 'id')
+    midterm_agg['mode'] = DictAggregations.mode(student_array, 'midterm')
+    final_agg['mode'] = DictAggregations.mode(student_array, 'final')
+
+    # Get variance.
+    id_agg['variance'] = DictAggregations.variance(student_array, 'id')
+    midterm_agg['variance'] = DictAggregations.variance(student_array, 'midterm')
+    final_agg['variance'] = DictAggregations.variance(student_array, 'final')
+
+    # Get standard deviation.
+    id_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'id')
+    midterm_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'midterm')
+    final_agg['standard_deviation'] = DictAggregations.standard_deviation(student_array, 'final')
+
+    # Display results.
+    logger.info('')
+    logger.info('Id Analysis Results (for debugging):')
+    for key, item in id_agg.items():
+        logger.info('    {0}: {1}'.format(key, item))
+    logger.info('')
+    logger.info('Midterm Analysis Results:')
+    for key, item in midterm_agg.items():
+        logger.info('    {0}: {1}'.format(key, item))
+    logger.info('')
+    logger.info('Final Analysis Results:')
+    for key, item in final_agg.items():
+        logger.info('    {0}: {1}'.format(key, item))
+    logger.info('')
 
 
 if __name__ == '__main__':
diff --git a/resources/stat_functions.py b/resources/stat_functions.py
new file mode 100644
index 0000000..9820d05
--- /dev/null
+++ b/resources/stat_functions.py
@@ -0,0 +1,149 @@
+"""
+Various stat functions to run.
+"""
+
+# System Imports.
+import numpy
+from scipy import stats
+
+# User Imports.
+from resources import logging as init_logging
+
+
+# Initialize Logger.
+logger = init_logging.get_logger(__name__)
+
+
+class DictAggregations():
+    """
+    These assume storage in the format of an array of dictionaries, where each dictionary represents a record.
+    Performs aggregation functions on a single dict attribute across all present records.
+    Works fine for small amounts of data but is probably not very efficient for larger datasets.
+    """
+    @staticmethod
+    def min(struct, attribute):
+        """
+        Finds the min of the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Min of dict attribute.
+        """
+        logger.info('Finding min of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return numpy.min(np_array)
+
+    @staticmethod
+    def max(struct, attribute):
+        """
+        Finds the max of the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Max of dict attribute.
+        """
+        logger.info('Finding max of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return numpy.max(np_array)
+
+    @staticmethod
+    def sum(struct, attribute):
+        """
+        Sums the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Sum of dict attribute.
+        """
+        logger.info('Finding sum of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return numpy.sum(np_array)
+
+    @staticmethod
+    def mean(struct, attribute):
+        """
+        Finds the mean of the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Mean of dict attribute.
+        """
+        logger.info('Finding mean of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return numpy.mean(np_array)
+
+    @staticmethod
+    def median(struct, attribute):
+        """
+        Finds the median of the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Median of dict attribute.
+        """
+        logger.info('Finding median of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return numpy.median(np_array)
+
+    @staticmethod
+    def mode(struct, attribute):
+        """
+        Finds the mode of the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Mode of dict attribute, as returned by scipy's stats.mode.
+        """
+        logger.info('Finding mode of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return stats.mode(np_array)
+
+    @staticmethod
+    def variance(struct, attribute):
+        """
+        Finds the variance of the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Variance of dict attribute.
+        """
+        logger.info('Finding variance of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return numpy.var(np_array)
+
+    @staticmethod
+    def standard_deviation(struct, attribute):
+        """
+        Finds the standard deviation of the attribute (key) for the provided dict.
+        :param struct: Data structure to aggregate on.
+        :param attribute: Attribute to aggregate.
+        :return: Standard deviation of dict attribute.
+        """
+        logger.info('Finding standard deviation of "{0}".'.format(attribute))
+
+        # Create numpy array of attribute.
+        np_array = numpy.array([x[attribute] for x in struct])
+
+        # Return aggregation.
+        return numpy.std(np_array)
-- 
GitLab