# -*- coding: utf-8 -*-
"""
Created on Wed Oct 16 2019

@author: rorymorrison

You may need to pip install some of the modules
"""

import datetime
import os
from datetime import timedelta

import bs4
import numpy as np
import pandas as pd
import requests

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

os.chdir('C:\\Users\\Rory\\.spyder-py3\\running_analysis')

# Race times are plotted on datetime axes further down the script, so each
# duration is stored as "epoch + duration".
_EPOCH = datetime.datetime(1970, 1, 1)


def _perf_to_datetime(perf):
    """Convert a "MM:SS.ff" performance string to a datetime offset from the epoch.

    Only whole minutes and seconds are kept; the fractional part must be
    present for the string to parse but is discarded.  Returns ``pd.NaT``
    when the value does not match the format.
    """
    try:
        parsed = datetime.datetime.strptime(perf, "%M:%S.%f")
    except (TypeError, ValueError):
        return pd.NaT
    return _EPOCH + timedelta(minutes=parsed.minute, seconds=parsed.second)


# Set up a function to get the data from PowerOf10
def get_table(link, event):
    """Download a PowerOf10 ranking list and return it as a DataFrame.

    Parameters
    ----------
    link : str
        URL of the ranking list page.
    event : str
        Short label (e.g. "5k") used to name the derived columns.

    Returns
    -------
    pandas.DataFrame
        One row per ranked athlete, using the page's own column headings,
        plus ``"Time_<event>"`` (the ``Perf`` column as a datetime offset
        from the epoch) and ``"Rank_<event>"`` (the ``Rank`` column as a
        number).  Rows with an empty name, an unparseable performance or
        any other missing value are dropped, and the result is sorted by
        time.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ranking-list heading row.
    """
    res = requests.get(link, timeout=30)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.content, "html.parser")

    heading_row = soup.find("tr", class_="rankinglistheadings")
    if heading_row is None:
        raise ValueError("no ranking list headings found at " + link)
    headings = [cell.getText() for cell in heading_row]

    rows = soup.find_all("tr", class_="rlr") + soup.find_all("tr", class_="rlra")
    print("number of rows found: " + str(len(rows)))
    rows_text = [[cell.getText() for cell in row.find_all("td")] for row in rows]

    df = pd.DataFrame(rows_text, columns=headings)
    # Assign back instead of replacing in place on a column selection, which
    # does not reliably write through to the parent frame.
    df['Name'] = df['Name'].replace('', np.nan)
    df.dropna(subset=['Name'], inplace=True)

    # Iterate over the values themselves: after dropna the index labels have
    # gaps, so looking rows up by 0..n-1 would pair times with the wrong rows.
    time = "Time_" + event
    parsed_times = [_perf_to_datetime(perf) for perf in df['Perf']]
    df[time] = pd.to_datetime(pd.Series(parsed_times, index=df.index),
                              errors='coerce')
    df.dropna(inplace=True)
    # sort_values returns a new frame unless asked to work in place.
    df.sort_values(time, inplace=True)

    rank = "Rank_" + event
    df[rank] = pd.to_numeric(df['Rank'], errors='coerce')
    return df


# Get the 5km data
link_5k = "https://www.thepowerof10.info/rankings/rankinglist.aspx?event=5000&agegroup=ALL&sex=M&year=2019"
table_5k = get_table(link_5k, "5k")

# Get the 10km data
link_10k = "https://www.thepowerof10.info/rankings/rankinglist.aspx?event=10000&agegroup=ALL&sex=M&year=2019"
table_10k = get_table(link_10k, "10k")

# Merge the two tables together using the 'Name' column as the reference.
# Columns present in both tables get pandas' default "_x" (5k) and "_y" (10k)
# suffixes.
mutual = pd.merge(table_5k, table_10k, on='Name')
# =============================================================================
# Linear regression
# =============================================================================
from scipy import stats


def _pb_to_minutes(pb):
    """Convert a "MM:SS..." personal-best string to decimal minutes."""
    return int(pb[:2]) + int(pb[3:5]) / 60


# Parse each athlete's 5km and 10km PB together.  If either one cannot be
# parsed the whole pair is skipped, so x[i] and y[i] always belong to the
# same athlete and the two lists always have the same length.
x = []
y = []
for pb_5k, pb_10k in zip(mutual.PB_x.tolist(), mutual.PB_y.tolist()):
    try:
        minutes_5k = _pb_to_minutes(pb_5k)
        minutes_10k = _pb_to_minutes(pb_10k)
    except (TypeError, ValueError):
        continue
    x.append(minutes_5k)
    y.append(minutes_10k)

# =============================================================================
# Filter the data before fitting: keep only athletes whose 10km PB is more
# than twice their 5km PB.  A 10km PB at or below twice the 5km PB is not a
# normal pairing and is left out of the model.
# =============================================================================
x_filt = []
y_filt = []
for minutes_5k, minutes_10k in zip(x, y):
    if minutes_5k * 2 < minutes_10k:
        x_filt.append(minutes_5k)
        y_filt.append(minutes_10k)

x = np.array(x_filt)
y = np.array(y_filt)

slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

# Evaluate the fitted line at 100 evenly spaced 5km times between 13 and 19
# minutes.
x_lr = list(np.linspace(13, 19, 100))
y_lr = [(intercept + slope * i) for i in x_lr]

df_lr = pd.DataFrame({'x': x_lr,
                      'y': y_lr})
df_lr['x_mins'] = [datetime.timedelta(minutes=i) for i in df_lr['x']]
df_lr['y_mins'] = [datetime.timedelta(minutes=i) for i in df_lr['y']]

# =============================================================================
# Make up a table to store my times
# =============================================================================
my_5k = "18:40.00"
my_10k = "39:35.00"
my_time = pd.DataFrame({'Name': ['Rory Morrison'],
                        'Perf_x': [my_5k],
                        'Perf_y': [my_10k]})

# The plot uses datetime axes, so each duration is stored as
# "epoch + duration".  Building the datetime explicitly avoids asking
# pd.to_datetime to interpret timedelta values.
epoch = datetime.datetime(1970, 1, 1)

my_5k = datetime.datetime.strptime(my_5k, "%M:%S.%f")
my_5k = timedelta(minutes=my_5k.minute, seconds=my_5k.second)
my_time['Time_5k'] = pd.to_datetime(pd.Series([epoch + my_5k]), errors='coerce')

my_10k = datetime.datetime.strptime(my_10k, "%M:%S.%f")
my_10k = timedelta(minutes=my_10k.minute, seconds=my_10k.second)
my_time['Time_10k'] = pd.to_datetime(pd.Series([epoch + my_10k]), errors='coerce')
# Plot the data
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma
from bokeh.plotting import figure, output_file, show
from bokeh.transform import transform

# Pull every plotted column out of the merged table as a plain list.
# "_x" columns come from the 5k table and "_y" columns from the 10k table.
(list_x, list_y, Time_5k, Time_10k, Rank_5k, Rank_10k, desc) = (
    mutual[column].tolist()
    for column in ('Time_5k', 'Time_10k', 'Perf_x', 'Perf_y',
                   'Rank_5k', 'Rank_10k', 'Name')
)

# Data behind the PowerOf10 scatter points.
source = ColumnDataSource(data={
    'x': list_x,
    'y': list_y,
    'desc': desc,
    'Rank_5k': Rank_5k,
    'Rank_10k': Rank_10k,
    'Time_5k': Time_5k,
    'Time_10k': Time_10k,
})

# Data behind the single "my time" point: source field -> my_time column.
my_columns = {
    'desc': 'Name',
    'x': 'Time_5k',
    'y': 'Time_10k',
    'Time_5k': 'Perf_x',
    'Time_10k': 'Perf_y',
}
my_source = ColumnDataSource(
    data={field: my_time[column].tolist() for field, column in my_columns.items()}
)

# Tooltip shown when hovering over a point.
tooltips = [
    ('Name', '@desc'),
    ('Rank 5k', '@Rank_5k'),
    ('5k Time', '@Time_5k'),
    ('Rank 10k', '@Rank_10k'),
    ('10k Time', '@Time_10k'),
]
hover = HoverTool(tooltips=tooltips)

# Colour the PowerOf10 points by 5k rank.
mapper = LinearColorMapper(palette=plasma(256),
                           low=min(Rank_5k),
                           high=max(Rank_5k))

# NOTE(review): `plot_width`, `plot_height` and `legend` look like the keyword
# names of the Bokeh release this script was written against; newer releases
# may have renamed them - confirm against the installed version.
p = figure(
    plot_width=600,
    plot_height=600,
    tools=[hover, 'pan', 'wheel_zoom'],
    title="10k vs 5k PBs",
    x_axis_label='5km (minutes)',
    y_axis_label='10km (minutes)',
    x_axis_type='datetime',
    y_axis_type='datetime',
)

p.circle('x', 'y', size=10, source=source,
         fill_color=transform('Rank_5k', mapper), legend='PowerOf10')
p.circle('x', 'y', size=10, fill_color='red', source=my_source,
         legend='My time')

# Regression line from the fitted model.
fit_x = df_lr['x_mins'].tolist()
fit_y = df_lr['y_mins'].tolist()
p.line(fit_x, fit_y, line_dash=[4, 4], line_width=4,
       line_color='orange', legend='best fit of PowerOf10')

p.ygrid.grid_line_dash = [6, 4]
p.xgrid.grid_line_dash = [6, 4]

p.legend.location = "top_left"
p.legend.click_policy = "hide"

output_file('test.html')
show(p)