From 9cd7b34f0cfbe6dbf62fa3b422c53e311de1f258 Mon Sep 17 00:00:00 2001
From: Brandon Rodriguez <brodriguez8774@gmail.com>
Date: Tue, 19 May 2020 01:37:43 -0400
Subject: [PATCH] Implement cosine similarity and KL divergence calculations

---
 documents/references.md     |  15 ++++++
 main.py                     |  80 ++++++++++++++++++++++++++-
 resources/stat_functions.py | 105 ++++++++++++++++++++++++++++++++++--
 3 files changed, 196 insertions(+), 4 deletions(-)

diff --git a/documents/references.md b/documents/references.md
index 45e4595..a72217f 100644
--- a/documents/references.md
+++ b/documents/references.md
@@ -41,6 +41,7 @@ Scipy is a scientific library that seems to now be generally associated with Num
 * Python Array to Numpy Array: <https://stackoverflow.com/questions/10121926/initialise-numpy-array-of-unknown-length>
 * Reshaping array Dimensions: <https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html>
 * Checking for "Numpy Array" type: <https://stackoverflow.com/a/12569453>
+* Dot Product of Numpy Arrays: <https://www.tutorialspoint.com/numpy/numpy_dot.htm>
 
 #### Numpy Aggregation Functions
 <https://jakevdp.github.io/PythonDataScienceHandbook/02.04-computation-on-arrays-aggregates.html>
@@ -56,5 +57,19 @@ Looks like this isn't built into scipy/numpy. Have to create it ourselves, which
 #### Finding the Mode of Data
 <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html>
 
+#### Minkowski Distance
+<https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.minkowski.html>
+<https://stackoverflow.com/a/48876998>
+
 #### Chi-Square Calculation
 <https://towardsdatascience.com/running-chi-square-tests-in-python-with-die-roll-data-b9903817c51b>
+
+#### Cosine Similarity
+<https://www.machinelearningplus.com/nlp/cosine-similarity/>
+
+#### Kullback-Leibler Divergence
+* How it works: <https://www.youtube.com/watch?v=LJwtEaP2xKA>
+* Code example and further explanation:
+<https://machinelearningmastery.com/divergence-between-probability-distributions/>
+* Normalizing a record so that its values sum to 1 (see the sketch below):
+<https://math.stackexchange.com/questions/278418/normalize-values-to-sum-1-but-keeping-their-weights>
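+
+A minimal sketch of the normalization step (plain Python; the record values are illustrative):
+
+```python
+record = {'red': 10, 'green': 40, 'blue': 50}
+total = sum(record.values())
+record = {key: (value / total) for key, value in record.items()}
+# The values now sum to 1: {'red': 0.1, 'green': 0.4, 'blue': 0.5}
+```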
diff --git a/main.py b/main.py
index 053ecdf..1c9de60 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,7 @@ Covers basic statistical measurements in data mining.
 
 # System Imports.
 import numpy
-from scipy import stats
+from scipy.spatial import minkowski_distance
 
 # User Imports.
 from resources import logging as init_logging
@@ -30,6 +30,8 @@ def test():
     Should probably make this into unit testing but then I'd feel obligated to unittest all my logic.
     Also this assignment is due soon and I've spent the past week straight relearning statistics,
     so I just don't have time to make proper unittests. Rip.
+
+    ...and then I proceeded to make pseudo-tests for almost all logic after initially writing the above comment. Lol.
     """
     # Test z-score normalization against https://youtu.be/5S-Zfa-vOXs
     struct = [
@@ -126,6 +128,61 @@ def test():
 
     RelationalAnalysis.chi_square_independence(data_array)
 
+    logger.info('')
+    logger.info('')
+
+    # Test cosine similarity against class book example (ch2, pg 78).
+    data_array = [
+        {'x': 5, 'y': 3},
+        {'x': 0, 'y': 0},
+        {'x': 3, 'y': 2},
+        {'x': 0, 'y': 0},
+        {'x': 2, 'y': 1},
+        {'x': 0, 'y': 1},
+        {'x': 0, 'y': 0},
+        {'x': 2, 'y': 1},
+        {'x': 0, 'y': 0},
+        {'x': 0, 'y': 1},
+    ]
+
+    RelationalAnalysis.cosine_similarity(data_array, 'x', 'y')
+
+    logger.info('')
+    logger.info('')
+
+    # Test Kullback-Leibler divergence against references (see documents/references.md).
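+    # KL divergence is asymmetric, so each pair is tested in both directions.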
+    # Test case 1.
+    data_array = [
+        {'red': 0.10, 'green': 0.40, 'blue': 0.50},
+        {'red': 0.80, 'green': 0.15, 'blue': 0.05},
+    ]
+    RelationalAnalysis.kl_divergence(data_array, 0, 1)
+    RelationalAnalysis.kl_divergence(data_array, 1, 0)
+
+    # Test normalization too.
+    data_array = [
+        {'red': 10, 'green': 40, 'blue': 50},
+        {'red': 80, 'green': 15, 'blue': 5},
+    ]
+    RelationalAnalysis.kl_divergence(data_array, 0, 1)
+    RelationalAnalysis.kl_divergence(data_array, 1, 0)
+
+    # Test case 2.
+    data_array = [
+        {'x': 1/2, 'y': 1/4, 'z': 1/4},
+        {'x': 1/4, 'y': 1/2, 'z': 1/4},
+    ]
+    RelationalAnalysis.kl_divergence(data_array, 0, 1)
+    RelationalAnalysis.kl_divergence(data_array, 1, 0)
+
+    # Test normalization too.
+    data_array = [
+        {'x': 5, 'y': 2.5, 'z': 2.5},
+        {'x': 2.5, 'y': 5, 'z': 2.5},
+    ]
+    RelationalAnalysis.kl_divergence(data_array, 0, 1)
+    RelationalAnalysis.kl_divergence(data_array, 1, 0)
+
 
 def calc_part_1():
     """
@@ -315,6 +372,27 @@ def calc_part_3():
             logger.info('    {0}'.format(item))
         logger.info('')
 
+        # Get Jaccard Coefficient.
+        # Defined by [ q / (q + r + s) ]
+        # Where:
+        #   q - Items in common.
+        #   r - Items only in set 1.
+        #   s - Items only in set 2.
+        jaccard_coeff = 58 / (58 + 2 + 120)
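+        # With the counts above, this evaluates to 58 / 180, approximately 0.3222.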
+        logger.info('Jaccard Coefficient: {0}'.format(jaccard_coeff))
+        logger.info('')
+
+        # Get Minkowski distances.
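+        # p=1 is the Manhattan distance, p=2 is the Euclidean distance,
+        # and p=inf is the supremum (Chebyshev) distance.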
+        cml_array = [x['CML'] for x in lib_array]
+        cbl_array = [x['CBL'] for x in lib_array]
+
+        mink_dist_1 = minkowski_distance(cml_array, cbl_array, 1)
+        mink_dist_2 = minkowski_distance(cml_array, cbl_array, 2)
+        mink_dist_inf = minkowski_distance(cml_array, cbl_array, float('inf'))
+        logger.info('Minkowski distance (p=1): {0}'.format(mink_dist_1))
+        logger.info('Minkowski distance (p=2): {0}'.format(mink_dist_2))
+        logger.info('Minkowski distance (p=inf): {0}'.format(mink_dist_inf))
+
 
 def calc_part_4():
     """
diff --git a/resources/stat_functions.py b/resources/stat_functions.py
index 217b5bd..4089dab 100644
--- a/resources/stat_functions.py
+++ b/resources/stat_functions.py
@@ -428,7 +428,7 @@ class RelationalAnalysis():
     def covariance(struct, attribute_1, attribute_2, display=True):
         """
         Finds the covariance of two attributes (keys) for the provided dict.
-        :param struct: Data structure to aggregate on.
+        :param struct: Data structure to calculate on.
         :param attribute_1: First attribute to calculate on.
         :param attribute_2: Second attribute to calculate on.
         :param display: Bool indicating if helper text should display.
@@ -456,7 +456,7 @@ class RelationalAnalysis():
     def pearsons_correlation_coefficient(struct, attribute_1, attribute_2, display=True):
         """
         Finds the covariance of two attributes (keys) for the provided dict.
-        :param struct: Data structure to aggregate on.
+        :param struct: Data structure to calculate on.
         :param attribute_1: First attribute to calculate on.
         :param attribute_2: Second attribute to calculate on.
         :param display: Bool indicating if helper text should display.
@@ -498,7 +498,7 @@ class RelationalAnalysis():
         """
         Parses values of two attributes (keys) from the provided dict. Formats for a chi-square analysis.
         Then finds the chi-square independence values of two attributes (keys) for the provided dict.
-        :param struct: Data structure to aggregate on.
+        :param struct: Data structure to calculate on.
         :param attribute_1: First attribute to calculate on.
         :param attribute_2: Second attribute to calculate on.
         :param display: Bool indicating if helper text should display.
@@ -602,3 +604,100 @@ class RelationalAnalysis():
 
         # Return final values.
         return chi_stat, p_val, dof, ex
+
+    @staticmethod
+    def cosine_similarity(struct, attribute_1, attribute_2, display=True, print_results=True):
+        """
+        Finds the cosine similarity of two attributes (keys) for the provided dict.
+        :param struct: Data structure to calculate on.
+        :param attribute_1: First attribute to calculate on.
+        :param attribute_2: Second attribute to calculate on.
+        :param display: Bool indicating if helper text should display.
+        :param print_results: Bool indicating if results should display.
+        :return: The associated similarity value.
+        """
+        if display:
+            logger.info('Finding cosine similarity of "{0}" and "{1}".'.format(attribute_1, attribute_2))
+
+        # Gather each attribute's values into numpy arrays.
+        attr_1_array = numpy.array([record[attribute_1] for record in struct])
+        attr_2_array = numpy.array([record[attribute_2] for record in struct])
+
+        # Sum of squares for each attribute, used for the vector magnitudes below.
+        attr_1_squared_sum = numpy.sum(attr_1_array ** 2)
+        attr_2_squared_sum = numpy.sum(attr_2_array ** 2)
+
+        # Calculate value.
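+        # Cosine similarity is the dot product of the two attribute vectors,
+        # divided by the product of their magnitudes (Euclidean norms):
+        #   cos(x, y) = (x . y) / (||x|| * ||y||)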
+        cosine_similarity = (
+            numpy.dot(attr_1_array, attr_2_array) / (math.sqrt(attr_1_squared_sum) * math.sqrt(attr_2_squared_sum))
+        )
+
+        if print_results:
+            logger.info('Cosine Similarity: {0}'.format(cosine_similarity))
+
+        # Return final result.
+        return cosine_similarity
+
+    @staticmethod
+    def kl_divergence(struct, index_1, index_2, display=True, print_results=True):
+        """
+        Finds the Kullback-Leibler divergence of two records in the provided data structure.
+        :param struct: Data structure to calculate on.
+        :param index_1: First record/distribution to calculate on.
+        :param index_2: Second record/distribution to calculate on.
+        :param display: Bool indicating if helper text should display.
+        :param print_results: Bool indicating if results should display.
+        :return: The associated divergence value.
+        """
+        if display:
+            logger.info('Finding Kullback-Leibler divergence of "{0}" and "{1}".'.format(index_1, index_2))
+
+        # Get sums of values.
+        index_1_sum = 0
+        index_2_sum = 0
+        for key in struct[index_1].keys():
+            index_1_sum += struct[index_1][key]
+            index_2_sum += struct[index_2][key]
+
+        # Check sums.
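+        # Note: normalization below happens in place, so the caller's struct is
+        # modified. The sum comparisons are also exact; values that only
+        # approximately sum to 1 may need a tolerance check (e.g. math.isclose).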
+        if index_1_sum != 1 and index_2_sum != 1:
+            # Neither record sums to 1. Normalize both.
+            for key in struct[index_1].keys():
+                struct[index_1][key] = (struct[index_1][key] / index_1_sum)
+                struct[index_2][key] = (struct[index_2][key] / index_2_sum)
+        elif index_1_sum != 1:
+            # First record does not sum to 1. Normalize.
+            for key in struct[index_1].keys():
+                struct[index_1][key] = (struct[index_1][key] / index_1_sum)
+        elif index_2_sum != 1:
+            # Second record does not sum to 1. Normalize.
+            for key in struct[index_2].keys():
+                struct[index_2][key] = (struct[index_2][key] / index_2_sum)
+
+        # Calculate value.
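+        # D(P || Q) = sum over all keys of [ P(key) * log2( P(key) / Q(key) ) ]
+        # (using log base 2, so the result is in bits).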
+        kl_divergence = 0
+        for key in struct[index_1].keys():
+            dist_1_val = struct[index_1][key]
+            dist_2_val = struct[index_2][key]
+
+            kl_divergence += (dist_1_val * math.log2(dist_1_val / dist_2_val))
+
+        if print_results:
+            logger.info('Kullback-Leibler Divergence: {0}'.format(kl_divergence))
+
+        # Return final value.
+        return kl_divergence
-- 
GitLab