"""
Covers basic statistical measurements in data mining.
"""

# System Imports.
import numpy
from scipy.spatial import minkowski_distance

# User Imports.
from resources import logging as init_logging
from resources.stat_functions import Aggregations, Normalizations, RelationalAnalysis


# Initialize Logger.
# Module-level logger obtained through the project's logging helper,
# named after this module so log records identify their origin.
logger = init_logging.get_logger(__name__)


def main():
    """
    Program main.

    Executes each assignment part in sequence.
    """
    for part_func in (calc_part_1, calc_part_2, calc_part_3, calc_part_4):
        part_func()

def test():
    """
    Ad-hoc verification of the stat helper functions against known worked examples.

    Should probably make this into unit testing but then I'd feel obligated to unittest all my logic.
    Also this assignment is due soon and I've spent the past week straight relearning statistics,
    so I just don't have time to make proper unittests. Rip.

    ...and then I proceeded to make pseudo-tests for almost all logic after initially writing the above comment. Lol.
    """
    def spacer():
        # Two blank log lines to visually separate test sections.
        logger.info('')
        logger.info('')

    # Test z-score normalization against https://youtu.be/5S-Zfa-vOXs
    z_struct = [{'x': val} for val in (2, 2, 3, 2, 5, 1, 6)]
    logger.info('{0}'.format(Normalizations.z_score(z_struct, 'x')))

    spacer()

    # Test covariance against https://youtu.be/0nZT9fqr2MU
    cov_struct = [
        {'x': x_val, 'y': y_val}
        for x_val, y_val in [(2.1, 8), (2.5, 10), (3.6, 12), (4.0, 14)]
    ]
    logger.info('{0}'.format(RelationalAnalysis.covariance(cov_struct, 'x', 'y')))

    spacer()

    # Test correlation coefficient against https://youtu.be/jBQz2RGxCek
    corr_struct = [
        {'x': x_val, 'y': y_val}
        for x_val, y_val in [(43, 99), (21, 65), (25, 79), (42, 75), (57, 87), (59, 81)]
    ]
    logger.info('{0}'.format(
        RelationalAnalysis.pearsons_correlation_coefficient(corr_struct, 'x', 'y')
    ))

    spacer()

    # Test chi-square table creation.
    # Desired end-result:
    #
    # data_array = numpy.array([
    #     [5, 6],
    #     [6, 7],
    # ])
    # Fresh dicts per record (not shared references), built from the counts above.
    bool_records = (
        [{'a': True, 'b': True} for _ in range(5)]
        + [{'a': True, 'b': False} for _ in range(6)]
        + [{'a': False, 'b': True} for _ in range(6)]
        + [{'a': False, 'b': False} for _ in range(7)]
    )
    RelationalAnalysis.chi_square_independence_parse(bool_records, 'a', 'b')

    spacer()

    # Test chi-square calculation against https://youtu.be/zOvUQWOzTlc.
    RelationalAnalysis.chi_square_independence(numpy.array([
        [11, 3, 8],
        [2, 9, 14],
        [12, 13, 28],
    ]))

    spacer()

    # Test cosine similarity against class book example (ch2, pg 78).
    cos_struct = [
        {'x': x_val, 'y': y_val}
        for x_val, y_val in [
            (5, 3), (0, 0), (3, 2), (0, 0), (2, 1),
            (0, 1), (0, 0), (2, 1), (0, 0), (0, 1),
        ]
    ]
    RelationalAnalysis.cosine_similarity(cos_struct, 'x', 'y')

    spacer()

    # Test kullback-leibler divergence against references (see documents/references.md).
    # Each case is run in both directions; the unnormalized variants of each
    # case additionally exercise the helper's normalization path.
    kl_cases = [
        # Test case 1.
        [
            {'red': 0.10, 'green': 0.40, 'blue': 0.50},
            {'red': 0.80, 'green': 0.15, 'blue': 0.05},
        ],
        # Test case 1, unnormalized.
        [
            {'red': 10, 'green': 40, 'blue': 50},
            {'red': 80, 'green': 15, 'blue': 5},
        ],
        # Test case 2.
        [
            {'x': 1 / 2, 'y': 1 / 4, 'z': 1 / 4},
            {'x': 1 / 4, 'y': 1 / 2, 'z': 1 / 4},
        ],
        # Test case 2, unnormalized.
        [
            {'x': 5, 'y': 2.5, 'z': 2.5},
            {'x': 2.5, 'y': 5, 'z': 2.5},
        ],
    ]
    for kl_struct in kl_cases:
        RelationalAnalysis.kl_divergence(kl_struct, 0, 1)
        RelationalAnalysis.kl_divergence(kl_struct, 1, 0)


def calc_part_1():
    """
    Logic for "part 1" of assignment.

    Parses the student scores file and logs aggregation statistics for each
    column (id aggregation is only logged for debugging purposes).
    """
    logger.info('')
    logger.info('=========================')
    logger.info('====      Part 1     ====')
    logger.info('=========================')
    logger.info('')

    # Parse tab-separated records of "id<TAB>midterm<TAB>final".
    student_array = []
    with open('./documents/data/data.online.scores.txt', 'r') as score_file:
        for raw_line in score_file:
            fields = raw_line.strip().split('\t')
            student_array.append({
                'id': int(fields[0]),
                'midterm': int(fields[1]),
                'final': int(fields[2]),
            })

    # Compute aggregation values for all three columns up front.
    id_agg = Aggregations.all(student_array, 'id', display=False)
    midterm_agg = Aggregations.all(student_array, 'midterm', display=False)
    final_agg = Aggregations.all(student_array, 'final', display=False)

    def show_results(title, agg_results):
        # Log one result set: blank line, title, then "key: value" per metric.
        logger.info('')
        logger.info(title)
        for agg_key, agg_value in agg_results.items():
            logger.info('    {0}: {1}'.format(agg_key, agg_value))

    # Display results.
    show_results('Id Analysis Results (for debugging):', id_agg)
    show_results('Midterm Analysis Results:', midterm_agg)
    show_results('Final Analysis Results:', final_agg)
    logger.info('')

def calc_part_2():
    """
    Logic for "part 2" of assignment.

    Parses the student scores file, z-score normalizes the midterm/final
    columns, logs aggregation statistics of the normalized data, compares a
    few original-vs-normalized records, and finally computes Pearson's
    correlation coefficient and covariance between midterm and final scores.
    """
    logger.info('')
    logger.info('=========================')
    logger.info('====      Part 2     ====')
    logger.info('=========================')
    logger.info('')

    student_array = []

    # Open file.
    with open('./documents/data/data.online.scores.txt', 'r') as student_scores_file:
        # Read in each line and save to our struct.
        for line in student_scores_file:
            # Split on tabs ("id<TAB>midterm<TAB>final").
            split_line = line.strip().split('\t')

            # Save to struct.
            student_array.append({
                'id': int(split_line[0]),
                'midterm': int(split_line[1]),
                'final': int(split_line[2]),
            })

    # Normalize both score columns via z-score.
    norm_student_array = Normalizations.z_score(student_array, attribute='midterm')
    norm_student_array = Normalizations.z_score(norm_student_array, attribute='final')

    # Get aggregation values of the normalized data.
    # NOTE(review): normalizing 'id' here is only to produce debug output;
    # this assumes z_score returns a new structure rather than mutating
    # student_array in place — verify against Normalizations.z_score.
    norm_id_agg = Aggregations.all(Normalizations.z_score(student_array, attribute='id'), 'id', display=False)
    norm_midterm_agg = Aggregations.all(norm_student_array, 'midterm', display=False)
    norm_final_agg = Aggregations.all(norm_student_array, 'final', display=False)

    # Display results.
    logger.info('')
    logger.info('Id Analysis Results (for debugging):')
    for key, item in norm_id_agg.items():
        logger.info('    {0}: {1}'.format(key, item))
    logger.info('')
    logger.info('Midterm Analysis Results:')
    for key, item in norm_midterm_agg.items():
        logger.info('    {0}: {1}'.format(key, item))
    logger.info('')
    logger.info('Final Analysis Results:')
    for key, item in norm_final_agg.items():
        logger.info('    {0}: {1}'.format(key, item))
    logger.info('')

    # Id 223 has both a midterm of 90 and final of 90.
    # Id 841 has lowest midterm of 35 and lowest final of 37.
    # Id 117 has highest midterm of 100 and highest final of 100.
    # NOTE(review): the subscripts below select by list position, not by
    # student id — this only matches the ids above if ids happen to line up
    # with record order; confirm against the data file.
    logger.info('')
    logger.info('Orig vs Normalization:')
    logger.info('')
    logger.info('Midterm/Final of 90:')
    logger.info('    Orig:       {0}'.format(student_array[223]))
    logger.info('    Normalized: {0}'.format(norm_student_array[223]))
    logger.info('')
    logger.info('Lowest Midterm/Final:')
    logger.info('    Orig:       {0}'.format(student_array[841]))
    logger.info('    Normalized: {0}'.format(norm_student_array[841]))
    logger.info('')
    logger.info('Highest Midterm/Final:')
    logger.info('    Orig:       {0}'.format(student_array[117]))
    logger.info('    Normalized: {0}'.format(norm_student_array[117]))
    logger.info('')

    # Pearson's Correlation Coefficient.
    pearson_coeff = RelationalAnalysis.pearsons_correlation_coefficient(student_array, 'midterm', 'final')
    logger.info('Pearson\'s Correlation Coefficient: {0}'.format(pearson_coeff))
    logger.info('')

    # Covariance.
    covariance = RelationalAnalysis.covariance(student_array, 'midterm', 'final')
    logger.info('Covariance: {0}'.format(covariance))
    logger.info('')


def calc_part_3():
    """
    Logic for "part 3" of assignment.

    Parses the library inventories file into per-book records holding CML and
    CBL counts, then computes the Jaccard coefficient, Minkowski distances
    (p=1, 2, inf), cosine similarity, and KL divergence between the two
    library inventories.
    """
    logger.info('')
    logger.info('=========================')
    logger.info('====      Part 3     ====')
    logger.info('=========================')
    logger.info('')

    lib_array = []

    # Current parse section: 1 - book ids, 2 - CML counts, 3 - CBL counts.
    # Initialized to None so a file that does not start with a recognized
    # header token is skipped instead of raising UnboundLocalError.
    data_type = None
    index = 0

    # Open file.
    with open('./documents/data/data.libraries.inventories.txt', 'r') as library_file:
        # Read in each line and save to our struct.
        for line in library_file:
            # Loop through all tab-separated values on the line.
            for line_val in line.strip().split('\t'):
                value = line_val.strip()

                # Section header tokens switch parse mode and reset position.
                if value == 'library':
                    data_type = 1
                    index = 0
                elif value == 'CML':
                    data_type = 2
                    index = 0
                elif value == 'CBL':
                    data_type = 3
                    index = 0
                elif data_type == 1:
                    # Is book id, formatted like "book_<id>".
                    lib_array.append({
                        'id': int(value.split('_')[1])
                    })
                elif data_type == 2:
                    # Is CML library count for the book at current position.
                    lib_array[index]['CML'] = int(value)
                    index += 1
                elif data_type == 3:
                    # Is CBL library count for the book at current position.
                    lib_array[index]['CBL'] = int(value)
                    index += 1

    # All analysis below runs after the "with" block so the file handle is
    # closed as soon as parsing is done.

    # Get Jaccard Coefficient.
    # Defined by [ q / (q + r + s) ]
    # Where:
    #   q - Items in common.
    #   r - Items only in set 1.
    #   s - Items only in set 2.
    # NOTE(review): q/r/s are hand-counted constants for this dataset, not
    # derived from lib_array — confirm against the data file.
    jaccard_coeff = (58)/(58 + 2 + 120)
    logger.info('Jaccard Coefficient: {0}'.format(jaccard_coeff))
    logger.info('')

    # Get minkowski distances (p=1 Manhattan, p=2 Euclidean, p=inf Chebyshev).
    cml_array = [x['CML'] for x in lib_array]
    cbl_array = [x['CBL'] for x in lib_array]

    mink_dist_1 = minkowski_distance(cml_array, cbl_array, 1)
    mink_dist_2 = minkowski_distance(cml_array, cbl_array, 2)
    mink_dist_inf = minkowski_distance(cml_array, cbl_array, float('inf'))
    logger.info('minkowski distance of 1: {0}'.format(mink_dist_1))
    logger.info('minkowski distance of 2: {0}'.format(mink_dist_2))
    logger.info('minkowski distance of inf: {0}'.format(mink_dist_inf))

    # Get cosine similarity (presumably logged by the helper itself — the
    # return value was never used here).
    RelationalAnalysis.cosine_similarity(lib_array, 'CML', 'CBL')

    # Get kl divergence. Re-key the count lists by book position so the
    # helper receives its expected [dict, dict] input shape.
    adjusted_lib_array = [dict(enumerate(cml_array)), dict(enumerate(cbl_array))]
    RelationalAnalysis.kl_divergence(adjusted_lib_array, 0, 1)


def calc_part_4():
    """
    Logic for "part 4" of assignment.

    Runs a chi-square independence test on a fixed 2x2 contingency table.
    """
    banner = (
        '',
        '=========================',
        '====      Part 4     ====',
        '=========================',
        '',
    )
    for banner_line in banner:
        logger.info(banner_line)

    contingency_table = numpy.array([
        [150, 40],
        [15, 3300],
    ])
    RelationalAnalysis.chi_square_independence(contingency_table)


# Script entry point: log start/end markers around the full assignment run.
if __name__ == '__main__':
    logger.info('Starting program.')
    main()
    logger.info('Terminating program.')