def test():
    """
    Ad-hoc sanity checks for the RelationalAnalysis functions.

    These are not unit tests: each check runs a calculation on a small, hand-verifiable dataset and logs
    the result so it can be compared by eye against the worked example referenced in the comment above it.
    """
    # Check covariance against https://youtu.be/0nZT9fqr2MU
    # Worked by hand with the (n - 1) divisor, this dataset gives roughly 2.267.
    struct = [
        {'x': 2.1, 'y': 8},
        {'x': 2.5, 'y': 10},
        {'x': 3.6, 'y': 12},
        {'x': 4.0, 'y': 14},
    ]

    result = RelationalAnalysis.covariance(struct, 'x', 'y')
    logger.info('{0}'.format(result))

    # Check correlation coefficient against https://youtu.be/jBQz2RGxCek
    # Worked by hand, this dataset gives roughly 0.5298.
    struct = [
        {'x': 43, 'y': 99},
        {'x': 21, 'y': 65},
        {'x': 25, 'y': 79},
        {'x': 42, 'y': 75},
        {'x': 57, 'y': 87},
        {'x': 59, 'y': 81},
    ]

    # Log the coefficient as well; without this the check produces no value to compare.
    result = RelationalAnalysis.pearsons_correlation_coefficient(struct, 'x', 'y')
    logger.info('{0}'.format(result))
class RelationalAnalysis():
    """
    Measures describing how two attributes of a dataset vary together.

    Both methods index each record of "struct" by the two attribute keys and do arithmetic on the values,
    so every record must contain both keys with numeric values.
    """
    @staticmethod
    def covariance(struct, attribute_1, attribute_2, display=True):
        """
        Finds the sample covariance of two attributes (keys) across the provided records.

        Computed as sum((a1 - mean_1) * (a2 - mean_2)) / (count - 1), where the count and means
        come from the Aggregations helpers.
        :param struct: Iterable of dict-like records to calculate on.
        :param attribute_1: First attribute to calculate on.
        :param attribute_2: Second attribute to calculate on.
        :param display: Bool indicating if helper text should display.
        :return: Sample covariance of the two attributes.
        """
        if display:
            logger.info('Finding covariance of "{0}" and "{1}".'.format(attribute_1, attribute_2))

        # Get required additional data.
        attr_count = Aggregations.count(struct, attribute_1, display=False)
        attr_1_mean = Aggregations.mean(struct, attribute_1, display=False)
        attr_2_mean = Aggregations.mean(struct, attribute_2, display=False)

        # Sum the products of each record's deviations from the two means.
        deviation_product_sum = sum(
            (record[attribute_1] - attr_1_mean) * (record[attribute_2] - attr_2_mean)
            for record in struct
        )

        # Divide by (count - 1) rather than count; a count of exactly 1 therefore divides by zero.
        return deviation_product_sum / (attr_count - 1)

    @staticmethod
    def pearsons_correlation_coefficient(struct, attribute_1, attribute_2, display=True):
        """
        Finds Pearson's correlation coefficient of two attributes (keys) across the provided records.

        Uses the single-pass computational form:
            r = (n * sum(xy) - sum(x) * sum(y))
                / sqrt((n * sum(x^2) - sum(x)^2) * (n * sum(y^2) - sum(y)^2))
        :param struct: Sized iterable of dict-like records to calculate on.
        :param attribute_1: First attribute to calculate on.
        :param attribute_2: Second attribute to calculate on.
        :param display: Bool indicating if helper text should display.
        :return: Pearson's correlation coefficient of the two attributes.
        :raises ZeroDivisionError: If either attribute has no spread (all values equal, or no records),
            since the coefficient is undefined in that case.
        """
        if display:
            logger.info('Finding Pearson\'s Correlation Coefficient of "{0}" and "{1}".'.format(attribute_1, attribute_2))

        # Accumulate every sum the formula needs in one pass over the records.
        attr_1_sum = 0
        attr_2_sum = 0
        attr_1_squared_sum = 0
        attr_2_squared_sum = 0
        mult_sum = 0
        for record in struct:
            attr_1 = record[attribute_1]
            attr_2 = record[attribute_2]

            attr_1_sum += attr_1
            attr_2_sum += attr_2
            attr_1_squared_sum += (attr_1 ** 2)
            attr_2_squared_sum += (attr_2 ** 2)
            mult_sum += attr_1 * attr_2
        attr_count = len(struct)

        # Each spread term is n^2 times that attribute's population variance, so it is never
        # mathematically negative. Float rounding can still push a zero spread slightly below zero.
        attr_1_spread = (attr_count * attr_1_squared_sum) - (attr_1_sum ** 2)
        attr_2_spread = (attr_count * attr_2_squared_sum) - (attr_2_sum ** 2)
        if attr_1_spread <= 0 or attr_2_spread <= 0:
            # Fail with an explicit message instead of a bare "float division by zero" or an
            # unrelated "math domain error" from taking the square root of a negative product.
            raise ZeroDivisionError(
                'Pearson\'s Correlation Coefficient is undefined: "{0}" or "{1}" has zero variance.'.format(
                    attribute_1, attribute_2,
                )
            )

        # Calculate full equation.
        numerator = (attr_count * mult_sum) - (attr_1_sum * attr_2_sum)
        denominator = math.sqrt(attr_1_spread * attr_2_spread)

        # Return final result.
        return numerator / denominator