# author: Michelle Szucs (mszucs)

import string
from datetime import *

try:
    import pylab
except ImportError:
    # Nothing in this part of the file calls pylab, so a missing plotting
    # library should not stop the text report from running.
    # NOTE(review): if code further down the file plots, it must check
    # for None first -- confirm.
    pylab = None

path = "raw_data.tsv"

# Question: We want to make some plots - or at least print out information -
# about the responses to the student poll. How can we break this problem
# up into smaller subunits? What kinds of tasks do we need to perform to
# analyze the data?

# The question labels, in the order the answers are saved in the file.
# question_mapping.txt lays out what the questions are called.
_LABELS = ("timestamp", "checkoff2_start", "checkoff2_end",
           "checkoff3_start", "checkoff3_end", "interview_start",
           "interview_end", "lectures_attended", "future_lectures",
           "personal_pace", "readings", "overall_pace", "year",
           "previous_coding", "6.01", "boredom_busters", "no_checkoff",
           "checkoff3_work", "checkoff2_length", "checkoff3_length",
           "interview_length", "repeat", "study", "confidence",
           "interview_difficulty")

# Every event here has a "<event>_start" and a "<event>_end" question.
_TIMED_EVENTS = ("checkoff2", "checkoff3", "interview")
_TIME_LABELS = frozenset(event + suffix
                         for event in _TIMED_EVENTS
                         for suffix in ("_start", "_end"))


def _parse_poll_time(answer):
    """Convert one start/end answer into a datetime in January 2015.

    The answer is split on spaces. The first two characters of the first
    piece are skipped and the remainder is read as the day of the month;
    the second piece is split on ":" into hour and minute.
    (Presumably answers look like "1/14 10:30" -- confirm against the
    raw data.)

    Raises IndexError or ValueError if the answer does not have that shape.
    """
    pieces = answer.split(" ")
    day = int(pieces[0][2:].strip())
    # Splitting the time at the colon makes separating the hour and
    # minutes easier.
    clock = pieces[1].split(":")
    return datetime(2015, 1, day, int(clock[0]), int(clock[1]))


def load_data(path):
    """Read the tab-delimited poll file at path into a list of dictionaries.

    Each element of the returned list (i.e., each dictionary) represents
    one student's response. Each key is a string naming a question (see
    _LABELS) and its value is the student's answer, as a string.

    For the six start/end questions an extra "<label>_time" key holds the
    answer as a datetime object, and for each of checkoff2, checkoff3 and
    interview a "<event>_wait" key holds the whole number of minutes
    between start and end.

    The first line of the file (the question headings) is discarded and
    blank lines are skipped. Answers beyond the known labels are ignored.

    Raises KeyError if a row is missing one of the start/end answers, and
    IndexError or ValueError if one of them cannot be parsed.

    Question: How should we model this data? We have a series of
    questions, for which we have multiple responses. What are different
    ways we can model the data?
    """
    poll_responses = []
    # "with" closes the file even if a row fails to parse.
    with open(path, 'r') as raw_data:
        # Discard the first line (the question headings) by reading it
        # and doing nothing.
        raw_data.readline()
        # Each remaining line represents a student and is a string.
        for line in raw_data:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. at the end of the file) is not a
                # response; without this check it would fail below with
                # a KeyError.
                continue
            # The file is tab-delimited, so split on the tab character.
            answers = stripped.split('\t')
            response = {}
            # Match each answer up with the corresponding question.
            for label, answer in zip(_LABELS, answers):
                if label in _TIME_LABELS:
                    response[label + "_time"] = _parse_poll_time(answer)
                # Note this happens for every answer.
                response[label] = answer
            # Subtracting two datetimes returns another object; take the
            # number of seconds it represents and floor-divide by 60 to
            # get whole minutes. One loop covers all three events instead
            # of repeating the same two lines per event.
            for event in _TIMED_EVENTS:
                waited = (response[event + "_end_time"]
                          - response[event + "_start_time"])
                response[event + "_wait"] = waited.seconds // 60
            poll_responses.append(response)
    return poll_responses


def examine_study_effects(poll_responses):
    """Print how studying related to the length of the interview.

    poll_responses is a list of dictionaries; each must have the keys
    "study" and "interview_length". For every combination of study
    category and interview-length category one line is printed with the
    number of students in that combination and that number as a
    percentage of all students in the same study category (0.0 when the
    study category is empty). A dashed line follows each study category.

    Raises ValueError if a response holds an answer that is not one of
    the categories listed below.

    Question: Can we draw strong conclusions from this output?
    """
    # Responses to study: there were two other possible responses, but
    # they are left out since nobody used them, to simplify the code.
    interview_lengths = ["0-5 minutes", "6-10 minutes", "11-15 minutes",
                         "16-20 minutes", "More than 20 minutes"]
    study_possibilities = ["I did not study at all", "1-30 minutes",
                           "31-60 minutes", "61-90 minutes"]

    # Count up how many people in each group of study length had
    # interviews of the given lengths: results[study][interview].
    results = [[0] * len(interview_lengths) for _ in study_possibilities]
    for student in poll_responses:
        # Find the student's bucket and add 1.
        interview_index = interview_lengths.index(student["interview_length"])
        study_index = study_possibilities.index(student["study"])
        results[study_index][interview_index] += 1

    # Print the results.
    for study_label, counts in zip(study_possibilities, results):
        group_size = sum(counts)
        for length_label, amount in zip(interview_lengths, counts):
            if group_size:
                percent = round(100 * float(amount) / group_size, 2)
            else:
                # Nobody picked this study category; avoid dividing by 0.
                percent = 0.0
            # Joining with spaces gives the same text as a
            # comma-separated print statement, on Python 2 and 3 alike.
            print(" ".join([str(amount), "or", str(percent),
                            "% of students had interviews that took",
                            length_label, "said they studied for",
                            study_label]))
        print("------------------------")


if __name__ == "__main__":
    # Only load the data and print the report when run as a script, so
    # the functions above can be imported without reading raw_data.tsv.
    poll_responses = load_data(path)
    examine_study_effects(poll_responses)