In [1]:
from aflutil import helper

In [2]:
!cat aflutil/helper.py

import sys
from collections import OrderedDict
import os 
from pathlib import Path

'''
The column header format you want to produce as the
main output. This is an ordered dictionary where the lookup 
value can be used to pull data from a two dimensional
array of all data later.
'''
# Output column names in their final order; each name maps to its
# zero-based position in a data row.
teams_header = OrderedDict(
    (column, position)
    for position, column in enumerate((
        'Team',
        'Year',
        'Round',
        'Where',
        'Opponent',
        'For_Scoring',
        'For_Total',
        'Against_Scoring',
        'Against_Total',
        'Result',
        'Margin',
        'WDL',
        'Venue',
        'Crowd',
        'Date',
    ))
)

'''
Create Results Directory if missing.
'''
def create_results_dir(output_dir):
  # Make sure output_dir exists, creating missing parent directories too.
  # Any failure is reported through eprint instead of being raised.
  try:
    if os.path.exists(output_dir):
      # Something is already at that path; leave it untouched.
      return
    Path(output_dir).mkdir(parents=True, exist_ok=True)
  except Exception as err:
    eprint ("[ERROR] Could not create output directory.")
    eprint ("[ERROR] with exception: {}".format(str(err)))

'''
Zero error log.
'''
def zero_error_log(error_log):
  # Opening in write mode truncates the file (or creates it); writing the
  # empty string leaves it with no content.
  with open(error_log, mode='w') as log_handle:
    log_handle.write('')

'''
Error pr

In [3]:
# Print the output column names; the cell output below shows them
# separated by tabs.
helper.print_header()

Team	Year	Round	Where	Opponent	For_Scoring	For_Total	Against_Scoring	Against_Total	Result	Margin	WDL	Venue	Crowd	Date


# Challenge 1 Function Template

Main function to implement. To get it working, you need to open each input file in input_files and walk through it line by line. The text files are cleanly formatted markdown with all data using the '|' separator. So each non-empty line can be split into fields using the split function. You can keep the data clean by using the strip
method to remove leading and trailing spaces in lines and fields. Your goal is simple -- build a two dimensional array (list of lists) where every line adheres to the teams_header, which is 15 columns of data. You can then convert this to a pandas dataframe or easily compute and validate simple statistical data by walking each row in the list, as you will know what information it should contain.

In [4]:
def process_teams(input_files):
    """Parse every team file into a single list of rows.

    Each file is read line by line and split on the '|' separator. Every
    data line becomes ``[team, year, <up to 13 fields from the line>]``.
    Field text is kept exactly as it appears in the file, including any
    surrounding spaces.

    Reading of a file stops at the first blank line or at end of file.

    Args:
        input_files: iterable of paths to the '|'-separated input files.

    Returns:
        A list of lists with one entry per data line, in file order.
    """
    data_array = []

    for file_name in input_files:
        with open(file_name, 'r') as data_file:
            team_name = ''
            year = 0  # 0 means "no year read yet for the current table"
            while True:
                line = data_file.readline()
                # readline() returns '' at end of file. Without this check,
                # a file that lacks a trailing blank line crashed with an
                # IndexError further down. A blank line still ends the
                # file's data, exactly as before.
                if not line or line == '\n':
                    break
                if line.find('---') > 0:
                    # A '| --- |' separator row. The line after it holds the
                    # team name (first time only) or the year (whenever the
                    # year has been reset); otherwise it is simply skipped.
                    if team_name == '':
                        team_name = data_file.readline().split('|')[1]
                    elif year == 0:
                        year = data_file.readline().split('|')[1]
                    continue
                fields = line.split('|')[1:14]
                if not fields:
                    # No '|' on this line (e.g. whitespace only): nothing
                    # to parse, so skip it instead of crashing.
                    continue
                label = fields[0].replace(' ', '')
                if label == 'Rnd':
                    # Column-title row of a results table.
                    continue
                if label in ('Totals', 'Averages'):
                    # Summary rows close a table; the next separator row
                    # is then followed by a new year.
                    year = 0
                    continue
                data_array.append([team_name, year] + fields)
    return data_array

# Main Function Calls for Challenge 1

This is the main function. You should not modify this unless you want to print the header in testing. Make sure to comment it out in your submission or you'll fail the test harness.

In [5]:
# Challenge 1 driver: parse the team files and write the raw data array.
# File locations used by this cell.
output_dir = 'results/'
error_log = 'results/error.log'
teams_file = 'data/teams.in'
array_file = 'results/array-initial.tsv'

# Make sure the results directory exists and start with an empty error log.
helper.create_results_dir(output_dir)
helper.zero_error_log(error_log)
# NOTE(review): read_input_file_names is not shown here; presumably it
# returns the list of input file paths named in teams_file -- confirm in
# aflutil/helper.py.
teams_files = helper.read_input_file_names(teams_file)
#print (teams_file)
data_array = process_teams(teams_files)
#helper.print_header()
# NOTE(review): print_array is not shown here; presumably it writes
# data_array to array_file -- confirm in aflutil/helper.py.
helper.print_array(data_array,array_file)

# Challenge  2 Helper Function

A function to compute a final score based on the traditional per quarter score format used by the AFL. Columns 7 and 5 will contain something like '1.0 1.4 4.5 5.8'. These map to the four quarters and are cumulative. So '5.8' means the team scored 5 goals worth 6 points each and had 8 behinds for 1 point each, which should sum up to 38. So the function should simply take a scoring row and return the final score for that field. This is not as easy as it looks. You need to split on spaces and then again on '.'. Once you have the goals and behinds from the string, you need to cast them to an integer to do any computations with them.

In [6]:
def compute_score(S):
    """Return the final score encoded in a per-quarter scoring string.

    ``S`` holds cumulative quarter scores written as ``goals.behinds`` and
    separated by whitespace, e.g. ``'1.0 1.4 4.5 5.8'``. Because the figures
    are cumulative, only the last quarter counts towards the final score:
    a goal is worth 6 points and a behind 1 point, so ``'5.8'`` gives
    ``5 * 6 + 8 == 38``.

    Leading and trailing whitespace is ignored, and at most the first four
    entries are considered. An empty (or all-whitespace) string yields 0.

    Raises:
        IndexError: if the final entry has no '.' separator.
        ValueError: if goals or behinds are not integers.
    """
    # split() with no argument drops empty tokens, so the result does not
    # depend on whether the field carries surrounding spaces.
    quarters = S.split()[:4]
    if not quarters:
        return 0

    # Scores are cumulative: the last quarter already is the final tally.
    final_parts = quarters[-1].split('.')
    goals = int(final_parts[0])
    behinds = int(final_parts[1])
    return goals * 6 + behinds

# Challenge 2 Validation Function

Once you can compute a score using a scoring field in the helper function above, you can validate every score, win/loss/tie, and score difference in the raw data array. The function should take the data array, and validate columns 6,8,10,11 of every row, which correspond to: For Final Score, Against Final Score, Result, and Margin respectively. If a row is incorrect, you should correct it. So the function accepts the original data_array and returns a modified data array that should have 100% of the scores, margins, and outcomes correct.

In [7]:
def validate_all_scores(data_array):
    """Recompute totals, result and margin for every row, in place.

    For each row the scoring strings in columns 5 and 7 are passed to
    compute_score. Columns 6 and 8 receive the resulting totals, column 10
    the difference (for minus against) and column 9 one of 'W', 'L' or 'D'.
    Numbers are stored as strings.

    Returns the same list object that was passed in.
    """
    for row in data_array:
        for_total = compute_score(row[5])
        against_total = compute_score(row[7])
        difference = for_total - against_total

        if difference == 0:
            outcome = 'D'
        elif difference > 0:
            outcome = 'W'
        else:
            outcome = 'L'

        row[6] = str(for_total)
        row[8] = str(against_total)
        row[9] = outcome
        row[10] = str(difference)
    return data_array

# Challenge 2 main function calls.

The main function calls. These should not be modified in your final version, so be careful if you change anything here.

In [8]:
# Challenge 2 driver: reload the Challenge 1 output, correct the scores and
# write the corrected array to a new file.
error_log = 'results/error.log'
input_array_file = 'results/array-initial.tsv'
output_array_file = 'results/array-updated.tsv'

#helper.create_results_dir(output_dir)
helper.zero_error_log(error_log)
# NOTE(review): load_tsv_file is not shown here; presumably it returns the
# rows of the TSV as a list of lists of strings -- confirm in
# aflutil/helper.py.
data_array = helper.load_tsv_file(input_array_file)
data_array = validate_all_scores(data_array)
helper.print_array(data_array,output_array_file)

In [9]:
from aflutil import AFLGame
import heapq

In [10]:
!cat aflutil/AFLGame.py


''' 
This class is used only for the final challenge problem where
you must find the five biggest home wins and losses.
'''    
class AFLGame:
  # Plain record of one game: the two sides, the year, the score
  # difference and both totals.

  def __init__(self, team, opponent, year, difference, totalfor, totalagainst):
    self.team, self.opponent = team, opponent
    self.year = year
    self.difference = difference
    self.totalfor, self.totalagainst = totalfor, totalagainst

  def __str__(self):
    # One line of the form "team-opponent (year) : for-against".
    # All pieces must already be strings.
    pieces = (
      self.team, '-', self.opponent,
      ' (', self.year, ') : ',
      self.totalfor, '-', self.totalagainst,
      '\n',
    )
    return ''.join(pieces)



# Challenge 3 Function

The final challenge is to walk the corrected data and return the five biggest home wins and losses in the dataset. Use the AFLGame class for this one. Create a list of games with their scores. Once you have the list, you can apply the function heapq.nlargest(5,list,key=sortkey) to order them to find the final result. An example case that should give you a good idea of how to solve this will be presented in the Lectorials when we discuss the Assignment.

In [11]:
def find_biggest_wins(data_array, where):
    """Return up to five rows with the largest margin for one location code.

    Rows are kept when column 3, with spaces removed, equals ``where``.
    They are then ranked by column 10 converted to int, largest first.
    """
    matching_rows = [
        row for row in data_array
        if row[3].replace(' ', '') == where
    ]

    def margin_of(row):
        return int(row[10])

    return heapq.nlargest(5, matching_rows, key=margin_of)

# Challenge 3 Main Function

The main function calls. These should not be modified in your final version, so be careful if you change anything here.

In [12]:
# Challenge 3 driver: load the corrected array and report the top five
# results for home ('H') and away ('A') games.
input_array = 'results/array-updated.tsv'
output_file = 'results/bigwins.txt'

data_array = helper.load_tsv_file(input_array)
home_wins = find_biggest_wins (data_array,'H')
away_wins = find_biggest_wins (data_array,'A')
# NOTE(review): write_top_five is not shown here; presumably it writes both
# lists to output_file -- confirm the element type it expects in
# aflutil/helper.py.
helper.write_top_five (home_wins, away_wins,output_file)