diff --git a/documents/references.md b/documents/references.md index b8130606fdd872a4903fd665338faa15817fe220..773e042f5041acf1e428c1351fa5d23c0cc91b05 100644 --- a/documents/references.md +++ b/documents/references.md @@ -20,3 +20,31 @@ Logging files can be found in `resources/logs/`. ### Reading Files <https://www.pythonforbeginners.com/files/reading-and-writing-files-in-python> + +### Calling One Static Method from Another +<https://stackoverflow.com/a/1859975> + +### Numpy/Scipy +Numpy seems to be the recommended way (in Python) to handle large datasets for "scientific" usage. I recall using it in +the Neural Net class I took a few years ago, so it seems to make sense to use it again here. + +Even more so because, looking at the docs, it seems able to compute things much faster than one would attain with normal +Python code. + +Scipy is a scientific library that seems to now be generally associated with Numpy. + +#### Numpy Basics +<https://numpy.org/devdocs/user/quickstart.html> + +#### Python Array to Numpy Array +<https://stackoverflow.com/questions/10121926/initialise-numpy-array-of-unknown-length> + +#### Numpy Aggregation Functions +<https://jakevdp.github.io/PythonDataScienceHandbook/02.04-computation-on-arrays-aggregates.html> + +#### Finding the Range of Data +Looks like this isn't built into scipy/numpy. Have to create it ourselves, which isn't difficult. +<https://stackoverflow.com/a/12701694> + +#### Finding the Mode of Data +<https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html> diff --git a/main.py b/main.py index ed3f932fff7e15d5fcfffed45021645b5934967a..69a6c3c541255bd931cf2ba4de86d28b370664eb 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ Covers basic statistical measurements in data mining. # User Imports. from resources import logging as init_logging +from resources.stat_functions import DictAggregations # Initialize Logger. @@ -27,6 +28,9 @@ def calc_part_1(): Logic for "part 1" of assignment. 
""" student_array = [] + id_agg = {} + midterm_agg = {} + final_agg = {} # Open file. with open('./documents/data.online.scores.txt', 'r') as student_scores_file: @@ -42,7 +46,63 @@ def calc_part_1(): 'final': int(split_line[2]), }) - logger.info(student_array[len(student_array) - 1]) + # logger.info(student_array[len(student_array) - 1]) + + + # Get min. + id_agg['min'] = DictAggregations.min(student_array, 'id') + midterm_agg['min'] = DictAggregations.min(student_array, 'midterm') + final_agg['min'] = DictAggregations.min(student_array, 'final') + + # Get max. + id_agg['max'] = DictAggregations.max(student_array, 'id') + midterm_agg['max'] = DictAggregations.max(student_array, 'midterm') + final_agg['max'] = DictAggregations.max(student_array, 'final') + + # Get sum. + id_agg['sum'] = DictAggregations.sum(student_array, 'id') + midterm_agg['sum'] = DictAggregations.sum(student_array, 'midterm') + final_agg['sum'] = DictAggregations.sum(student_array, 'final') + + # Get mean. + id_agg['mean'] = DictAggregations.mean(student_array, 'id') + midterm_agg['mean'] = DictAggregations.mean(student_array, 'midterm') + final_agg['mean'] = DictAggregations.mean(student_array, 'final') + + # Get median. + id_agg['median'] = DictAggregations.median(student_array, 'id') + midterm_agg['median'] = DictAggregations.median(student_array, 'midterm') + final_agg['median'] = DictAggregations.median(student_array, 'final') + + # Get mode. + id_agg['mode'] = DictAggregations.mode(student_array, 'id') + midterm_agg['mode'] = DictAggregations.mode(student_array, 'midterm') + final_agg['mode'] = DictAggregations.mode(student_array, 'final') + + # Get variance. + id_agg['variance'] = DictAggregations.variance(student_array, 'id') + midterm_agg['variance'] = DictAggregations.variance(student_array, 'midterm') + final_agg['variance'] = DictAggregations.variance(student_array, 'final') + + # Get standard deviation. 
class DictAggregations:
    """
    Aggregation functions over an array of dictionaries, where each dictionary represents one record.
    Every method aggregates a single attribute (key) across all provided records.
    The attribute values are copied into a fresh numpy array on every call, so this works fine for small
    amounts of data but is probably not very efficient for larger datasets.
    """
    @staticmethod
    def _attribute_array(struct, attribute):
        """
        Collects the given attribute (key) from every record into a numpy array.
        :param struct: Iterable of records (dicts) to read from.
        :param attribute: Attribute (key) to extract from each record.
        :return: Numpy array holding the attribute value of each record, in iteration order.
        """
        # Plain subscript access is used, so a dict record missing the key raises KeyError.
        return numpy.array([record[attribute] for record in struct])

    @staticmethod
    def min(struct, attribute):
        """
        Finds the min of the attribute (key) across the provided records.
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: Min of dict attribute.
        """
        logger.info('Finding min of "{0}".'.format(attribute))
        return numpy.min(DictAggregations._attribute_array(struct, attribute))

    @staticmethod
    def max(struct, attribute):
        """
        Finds the max of the attribute (key) across the provided records.
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: Max of dict attribute.
        """
        logger.info('Finding max of "{0}".'.format(attribute))
        return numpy.max(DictAggregations._attribute_array(struct, attribute))

    @staticmethod
    def sum(struct, attribute):
        """
        Sums the attribute (key) across the provided records.
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: Sum of dict attribute.
        """
        logger.info('Finding sum of "{0}".'.format(attribute))
        return numpy.sum(DictAggregations._attribute_array(struct, attribute))

    @staticmethod
    def mean(struct, attribute):
        """
        Finds the (arithmetic) mean of the attribute (key) across the provided records.
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: Mean of dict attribute.
        """
        logger.info('Finding mean of "{0}".'.format(attribute))
        return numpy.mean(DictAggregations._attribute_array(struct, attribute))

    @staticmethod
    def median(struct, attribute):
        """
        Finds the median of the attribute (key) across the provided records.
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: Median of dict attribute.
        """
        logger.info('Finding median of "{0}".'.format(attribute))
        return numpy.median(DictAggregations._attribute_array(struct, attribute))

    @staticmethod
    def mode(struct, attribute):
        """
        Finds the mode of the attribute (key) across the provided records.
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: The unmodified result of scipy.stats.mode (exposes ".mode" and ".count"), not a bare value.
        """
        logger.info('Finding mode of "{0}".'.format(attribute))

        # NOTE(review): whether ".mode" / ".count" are scalars or one-element arrays depends on the installed
        # scipy version - confirm before indexing into the result.
        return stats.mode(DictAggregations._attribute_array(struct, attribute))

    @staticmethod
    def variance(struct, attribute):
        """
        Finds the variance of the attribute (key) across the provided records.
        Uses numpy.var with its default arguments, which gives the population variance (ddof=0).
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: Variance of dict attribute.
        """
        logger.info('Finding variance of "{0}".'.format(attribute))
        return numpy.var(DictAggregations._attribute_array(struct, attribute))

    @staticmethod
    def standard_deviation(struct, attribute):
        """
        Finds the standard deviation of the attribute (key) across the provided records.
        Uses numpy.std with its default arguments, which gives the population standard deviation (ddof=0).
        :param struct: Data structure to aggregate on.
        :param attribute: Attribute to aggregate.
        :return: Standard deviation of dict attribute.
        """
        logger.info('Finding standard deviation of "{0}".'.format(attribute))
        return numpy.std(DictAggregations._attribute_array(struct, attribute))