# author: Michelle Szucs (mszucs)
import string
from datetime import *
import pylab

# Location of the tab-delimited poll export; it is passed to load_data below.
path = "raw_data.tsv"

"""
Question: We want to make some plots - or at least print out
information - about the responses to the student poll. How can
we break this problem up into smaller subunits? What kinds of
tasks do we need to perform to analyze the data?
"""

def _parse_poll_time(answer):
    """
    Turn one start/end time response into a datetime object.

    The response is split on spaces into a date piece and a time piece.
    The first two characters of the date piece are skipped and the rest
    is read as the day of the month (presumably the responses look like
    "1/14 10:45" -- confirm against the data file). The time piece is
    split at the colon into the hour and the minute. Every response is
    placed in January 2015.
    """
    pieces = answer.split(" ")
    day = int(pieces[0][2:].strip())
    clock = pieces[1].split(":")
    return datetime(2015, 1, day, int(clock[0]), int(clock[1]))


def load_data(path):
    """
    Read the poll file at path and return a list of dictionaries.

    Question: How should we model this data? We have a
    series of questions, for which we have multiple
    responses. What are different ways we can model the data?

    I chose to make a list of dictionaries. Each element of the
    list (i.e., each dictionary) represents a different student's response.

    Each key in the dictionary is a string corresponding to a
    different question (see the labels list below). The value
    associated with the key is the student's response to that question,
    stored as the string read from the file.

    Three kinds of extra keys are added for each of the checkoff2,
    checkoff3 and interview events:
      <event>_start_time / <event>_end_time -- the response as a datetime
      <event>_wait -- whole minutes between the start and the end

    The first line of the file (the question headings) is discarded and
    blank lines are skipped. Columns beyond the known labels are ignored.
    A row that is missing one of the start/end columns raises a KeyError,
    and a start/end response that cannot be parsed raises a ValueError
    or an IndexError.
    """
    # Start by defining the question labels in the order they're saved
    # in the file
    labels = ["timestamp", "checkoff2_start", "checkoff2_end", "checkoff3_start",
              "checkoff3_end", "interview_start", "interview_end",
              "lectures_attended", "future_lectures", "personal_pace",
              "readings", "overall_pace", "year", "previous_coding",
              "6.01", "boredom_busters", "no_checkoff", "checkoff3_work",
              "checkoff2_length", "checkoff3_length", "interview_length", "repeat",
              "study", "confidence", "interview_difficulty"]

    # The events that have a start and an end time. Building the list of
    # time labels once (instead of once per answer) also lets us reuse the
    # event names when computing the waits below.
    timed_events = ("checkoff2", "checkoff3", "interview")
    timed_labels = []
    for event in timed_events:
        timed_labels.append(event + "_start")
        timed_labels.append(event + "_end")

    poll_responses = []
    # The with statement closes the file for us, even if one of the
    # lines can't be parsed and an error is raised.
    with open(path, 'r') as raw_data:
        # Discard the first line (the question headings) by reading it
        # and doing nothing
        raw_data.readline()
        # Each remaining line represents a student and is a string.
        for line in raw_data:
            # The strip method removes whitespace from both ends
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g., at the end of the file) is not a
                # response, so don't try to build a dictionary from it.
                continue
            # The file is tab-delimited, so split on the tab character
            # and pair each answer with its question label. zip stops at
            # the shorter of the two lists, so extra columns are ignored.
            new_dict = {}
            for label, answer in zip(labels, stripped.split('\t')):
                if label in timed_labels:
                    # Also store the response as a datetime object
                    new_dict[label + "_time"] = _parse_poll_time(answer)
                # Note: this happens for every answer
                new_dict[label] = answer

            # Calculate the length of the wait for each event.
            # Subtracting two datetimes gives a timedelta; total_seconds()
            # includes whole days (the seconds attribute alone would drop
            # them), and floor division by 60 gives whole minutes.
            for event in timed_events:
                wait = new_dict[event + "_end_time"] - new_dict[event + "_start_time"]
                new_dict[event + "_wait"] = int(wait.total_seconds() // 60)

            # Add this dictionary to our list of responses
            poll_responses.append(new_dict)
    # Send the data structure back to the rest of the program
    return poll_responses

# Load the responses as soon as the module runs; the resulting list of
# dictionaries is handed to examine_study_effects at the bottom of the file.
poll_responses = load_data(path)

def examine_study_effects(poll_responses):
    """
    Given a list poll_responses, prints information about how studying
    affected the length of interview.

    poll_responses is a list of dictionaries; each dictionary must have
    an "interview_length" key and a "study" key whose values are among
    the response strings listed below. Any other response raises a
    ValueError (from list.index).

    For each study group, one line is printed per interview length with
    the number of students in that bucket and the percentage of the
    study group it represents (rounded to 2 places). A study group with
    no students reports 0.0 percent for every interview length. A line
    of dashes is printed after each study group. Nothing is returned.

    Question: Can we draw strong conclusions from this output?
    """
    # Responses to study: I did not study at all, 1-30 minutes, 31-60 minutes,
    # 61-90 minutes (there were two other possible responses, but I'm leaving
    # them out since nobody used them to simplify the code)
    #
    # Responses for interview_length: 0-5 minutes, 6-10 minutes, 11-15 minutes,
    # 16-20 minutes, More than 20 minutes
    interview_lengths = ["0-5 minutes", "6-10 minutes", "11-15 minutes",
                         "16-20 minutes", "More than 20 minutes"]

    study_possibilities = ["I did not study at all", "1-30 minutes", "31-60 minutes",
                           "61-90 minutes"]

    # I'm going to count up how many people in each group of study length had
    # interviews of the given lengths. results has one row per study group
    # and one column per interview length; sizing it from the lists above
    # keeps it in step with them.
    results = [[0] * len(interview_lengths) for _ in study_possibilities]

    # student will take on the value of each dictionary in poll_responses
    for student in poll_responses:
        # find the student's bucket and add 1
        interview_index = interview_lengths.index(student["interview_length"])
        study_index = study_possibilities.index(student["study"])
        results[study_index][interview_index] += 1

    # print the results
    for study_label, counts in zip(study_possibilities, results):
        group_total = sum(counts)
        for interview_label, amount in zip(interview_lengths, counts):
            if group_total:
                percent = round(100 * float(amount) / group_total, 2)
            else:
                # Nobody gave this study response, so there is nothing to
                # divide by; report 0.0 instead of raising ZeroDivisionError
                percent = 0.0
            # Build the whole line as one string so that print behaves the
            # same whether it is a statement (Python 2) or a function
            # (Python 3).
            pieces = [str(amount), "or", str(percent),
                      "% of students had interviews that took",
                      interview_label, "said they studied for",
                      study_label]
            print(" ".join(pieces))
        print("------------------------")
    
# Print the study-time vs. interview-length breakdown for the loaded responses.
examine_study_effects(poll_responses)