# MIT 6.034 Lab 5: k-Nearest Neighbors and Identification Trees

from api import *

features = [\
    "Age",                                #int
    "Sex",                                #M or F
    "Chest pain type",                    #typical angina, atypical angina, non-anginal pain, asymptomatic (note angina = chest pain)
    "Resting blood pressure",             #int
    "Cholesterol level",                  #int
    "Is fasting blood sugar < 120 mg/dl", #Yes or No
    "Resting EKG type",                   #normal, wave abnormality, ventricular hypertrophy
    "Maximum heart rate",                 #int
    "Does exercise cause chest pain?",    #Yes or No
    "ST depression induced by exercise",  #int
    "Slope type",                         #up, flat, down
    "# of vessels colored",               #float or ?
    "Thal type",                          #normal, fixed defect, reversible defect, unknown
    "Heart disease presence",             #healthy or diseased
    "Heart disease level",                #int 0 (healthy, no presence) to 4
    ]

intify = lambda s: int(float(s))

heart_training_data = []
patient_number = 1
with open('cleveland_medical_data.txt') as f:
    for line in f:
        if (not line) or line[0] in "\n\r" or line[0] == "%":
            continue
        values = line.strip("\n\r").split()
        if len(values) < 15:
            print('Error: line shorter than expected; skipping line...')
            continue
        person = {}
        person["name"] = "patient" + str(patient_number)
        person["Age"] = intify(values[0]) #int (in years)
        person["Sex"] = {"male":"M", "fem":"F"}[values[1]] #M or F
        person["Chest pain type"] = {"angina":"typical angina", "abnang":"atypical angina",
                                     "notang":"non-anginal pain", "asympt":"asymptomatic"}[values[2]] #typical angina, atypical angina, non-anginal pain, asymptomatic (note angina = chest pain)
        person["Resting blood pressure"] = intify(values[3]) #int (in mm Hg)
        person["Cholesterol level"] = intify(values[4]) #int (in mg/dl)
        person["Is fasting blood sugar < 120 mg/dl"] = {"true":"Yes", "fal":"No"}[values[5]] #Yes or No
        person["Resting EKG type"] = {"norm":"normal", "abn":"wave abnormality", "hyp":"ventricular hypertrophy"}[values[6]] #normal, wave abnormality, ventricular hypertrophy
        person["Maximum heart rate"] = intify(values[7]) #int
        person["Does exercise cause chest pain?"] = {"true":"Yes", "fal":"No"}[values[8]] #Yes or No
        person["ST depression induced by exercise"] = intify(values[9]) #int ("ST depression induced by exercise relative to rest")
        person["Slope type"] = values[10] #up, flat, down
        person["# of vessels colored"] = "?" if values[11] == "?" else float(values[11]) #float or '?'; number of major vessels (0-3) colored by fluoroscopy
        person["Thal type"] = {"norm":"normal", "fix":"fixed defect", "rev":"reversible defect", "?":"unknown"}[values[12]] #normal, fixed defect, reversible defect, unknown
        person["Heart disease presence"] = {"buff":"healthy", "sick":"diseased"}[values[13]] #healthy or diseased
        person["Heart disease level"] = {"H":0, "S1":1, "S2":2, "S3":3, "S4":4}[values[14]] #int 0 (healthy, no presence) to 4
        heart_training_data.append(person)
        patient_number += 1

heart_classifiers = [\
    feature_test(features[1]),
    feature_test(features[2]),
    feature_test(features[5]),
    feature_test(features[6]),
    feature_test(features[8]),
    feature_test(features[10]),
    feature_test(features[12]),
    ]

def threshold_test_with_unknown(feature, threshold):
    def classify_function(pt):
        val = pt.get(feature)
        if val == '?':
            return '?'
        if maybe_number(val) > threshold:
            return 'Yes'
        return 'No'
    return Classifier(feature + " > " + str(threshold) + " (or ?)", classify_function)
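
# The lines below are a small, commented-out illustration of how
# threshold_test_with_unknown behaves on a '?' value; they are not part of the
# lab. They assume the Classifier objects from api expose a classify(point)
# method, and the sample values are made up rather than drawn from the dataset.
# vessel_test = threshold_test_with_unknown("# of vessels colored", 0.5)
# print(vessel_test.classify({"# of vessels colored": 2.0}))  # expected: 'Yes'
# print(vessel_test.classify({"# of vessels colored": "?"}))  # expected: '?'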

def all_midpoint_tests(feature_name, data):
    """Creates threshold tests for the feature, one for each midpoint value.
    If '?' is a value, creates 3-option threshold tests with Yes/No/?.
    Returns a list of Classifier objects."""
    test_making_fn = threshold_test
    tests = []
    values = set([p[feature_name] for p in data])
    if '?' in values:
        values.remove('?')
        test_making_fn = threshold_test_with_unknown
    values = sorted(list(values))
    for (v1, v2) in zip(values[:-1], values[1:]):
        tests.append(test_making_fn(feature_name, (v1+v2)/2.))
    return tests

heart_classifiers.extend(all_midpoint_tests(features[0], heart_training_data))
heart_classifiers.extend(all_midpoint_tests(features[3], heart_training_data))
heart_classifiers.extend(all_midpoint_tests(features[4], heart_training_data))
heart_classifiers.extend(all_midpoint_tests(features[7], heart_training_data))
heart_classifiers.extend(all_midpoint_tests(features[9], heart_training_data))
heart_classifiers.extend(all_midpoint_tests(features[11], heart_training_data))

heart_target_classifier_binary = feature_test(features[13])
heart_target_classifier_discrete = feature_test(features[14])

# If you want to try using k-nearest neighbors on this data, uncomment the lines
# below. Note that one of the numeric features (# of vessels colored) is not
# included due to the possible '?' value. Also note that you'll probably need
# to normalize the data for each feature, e.g. by dividing by the standard
# deviation of that feature's values, in order to get results that consider
# all features relatively equally (see the sketch below).

# knn_heart_training_data = []
# numeric_features = [features[i] for i in [0,3,4,7,9]] #note: feature 11 (# of vessels colored) not included due to non-numeric '?'
# for person in heart_training_data:
#     point = Point([person[attr] for attr in numeric_features], #todo: normalize data for each feature
#                   classification=person[features[13]], #or 14 for discrete classification
#                   name=person['name'])
#     knn_heart_training_data.append(point)
# print(knn_heart_training_data[-1]) #uncomment to print a sample Point
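
# The lines below are a rough, commented-out sketch of the per-feature
# normalization suggested above; dividing by the standard deviation is one
# reasonable choice of scale, not the required one. The sketch assumes the kNN
# block above has been uncommented (so numeric_features and Point are defined)
# and uses the standard library's statistics module.
# import statistics
# stdevs = {attr: statistics.pstdev([person[attr] for person in heart_training_data])
#           for attr in numeric_features}
# knn_heart_training_data = [
#     Point([person[attr] / stdevs[attr] for attr in numeric_features],
#           classification=person[features[13]], #or 14 for discrete classification
#           name=person['name'])
#     for person in heart_training_data]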

################################################################################
################################################################################

# ANSWERS TO QUESTIONS FROM LAB 5 WIKI PAGE:

# Q: Does this seem like the simplest possible tree? If not, why not?
# A: No, it is probably overfitting and giving too much weight to outliers in
#    the data. For example, many leaf nodes correspond to only 1-2 patients.

# Q: What could we change to make the tree simpler?
# A: One possible change is to modify the stopping condition. Rather than
#    continuing to add classifiers until the data is perfectly separated, we
#    could stop when a node reaches some homogeneity threshold, such as 90%,
#    then assign the node the classification of the majority of the points.
#    This method would reduce overfitting, although in the extreme case, it
#    could result in underfitting (e.g. if you use a threshold of 51%, or 90%
#    but with a very small dataset). (A rough sketch of this idea appears at
#    the bottom of this file.)

# Q: What do you notice when you print the training data at the leaf nodes?
# A: One interesting observation is that many nodes have only a few training
#    points, while a few nodes have a large number of training points. The
#    nodes with many training points probably represent reliable
#    classifications, but those with few training points may give misleading
#    results due to overfitting.

# Q: Try using the other target classifier (binary or discrete). Is the tree
#    simpler or more complicated? Why?
# A: The binary-classification tree is simpler because it doesn't need to
#    separate the data as much. The discrete-classification tree is more
#    complicated because nodes that contain only diseased patients don't count
#    as homogeneous if those patients have different levels of heart disease.

# Q: Try creating and classifying a few patients. Are the results consistent
#    with your expectations?
# A: Expectations may vary, of course, but we've found that, in general,
#    patients who seem healthy (normal values for all features, as in the
#    sample test_patient) get classified as 'healthy', and patients who seem
#    unhealthy get classified as 'diseased'. However, there are some
#    exceptions...

# Q: What happens if a female patient has 'Thal type': 'unknown'?
# A: She gets classified as healthy (or 0) because the first test in the ID
#    tree is 'Thal type', and the first (and only) test for 'unknown' is 'Sex',
#    with 'F' indicating healthy/0.

# Q: What happens if a male patient has 'Thal type': 'unknown'?
# A: He gets classified as diseased (or 2) because the first test in the ID
#    tree is 'Thal type', and the first (and only) test for 'unknown' is 'Sex',
#    with 'M' indicating diseased/2.

# Q: What causes the surprising result with 'Thal type': 'unknown'?
# A: There are only two patients in the data set with 'Thal type': 'unknown'.
#    It just happens that one is female and healthy, while the other is male
#    and diseased. This is an excellent example of overfitting.

# Q: What could we change to improve the classification accuracy of patients
#    with unknown Thal type?
# A: There are many possible changes, including (1) gathering more data on
#    patients with unknown Thal type, or (2) ignoring the 'Thal type' test
#    altogether and constructing a tree using the remaining tests.

# Q: In the discrete classification tree, what happens if a patient has:
#        'Thal type': 'normal',
#        'Chest pain type': 'asymptomatic',
#        '# of vessels colored': '?'
#    ...and why does it happen?
# A: The first test in the ID tree is 'Thal type', the first test for 'normal'
#    is 'Chest pain type', and the first test for 'asymptomatic' is '# of
#    vessels colored > 0.5 (or ?)', but that node (# vessels) has only two
#    branches, 'Yes' and 'No', so the patient with '?' cannot be classified.
#    This problem arises because all of the patients in the training data with
#    'Thal type': 'normal' and 'Chest pain type': 'asymptomatic' also had a
#    known number of vessels colored (not '?').

# Q: Why might the issue with '# of vessels colored': '?' cause a problem for
#    classifying real patients? What can we do to fix this issue?
# A: Many patients, especially healthy ones, may not have undergone the
#    procedure for determining how many major vessels are colored by
#    fluoroscopy, and it would be nice to be able to classify them without
#    that information. As with the 'Thal type' problem, solutions include
#    (1) gathering more information on patients with an unknown number of
#    vessels colored, or (2) ignoring the test altogether. A third option is
#    to assign these patients the classification corresponding to the majority
#    of patients who have 'Thal type': 'normal' and 'Chest pain type':
#    'asymptomatic' (in this case, healthy/0).
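
# The lines below are a rough, commented-out sketch of the homogeneity-threshold
# stopping condition described above. The function is purely illustrative and is
# not part of the lab api; it works directly on the patient dictionaries in
# heart_training_data rather than on identification-tree nodes.
# def majority_stop(points, target_feature="Heart disease presence", threshold=0.9):
#     """Return (should_stop, majority_label) for the training points at a node:
#     stop once at least `threshold` of the points share one classification, and
#     label the node with that majority classification."""
#     from collections import Counter
#     label, count = Counter(p[target_feature] for p in points).most_common(1)[0]
#     return (count / float(len(points)) >= threshold, label)
# For example, majority_stop(heart_training_data) reports whether the full
# training set already meets the 90% threshold for the binary classification.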