import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
%matplotlib inline


# Parse the rating and time-control columns as float64 explicitly instead of
# leaving their dtypes to be inferred.
float_columns = [
    'BlackElo',
    'BlackRatingDifference',
    'TimeControl',
    'TimeIncrement',
    'WhiteElo',
    'WhiteRatingDifference',
]
df = dd.read_csv(
    "lichess_data.csv",
    low_memory=False,
    dtype=dict.fromkeys(float_columns, 'float64'),
)
df.head()


# Print all column names, space-separated, on a single line
print(*df.columns)

Event Site Date Round White Black Result UtcDateTime WhiteElo BlackElo WhiteRatingDifference BlackRatingDifference ECO OpeningName TimeControl TimeIncrement Termination Move1 Move2 Move3 Move4 Move5 Move6 Move7 Move8 Move9 Move10 Move11 Move12 Move13 Move14 Move15 Move16 Move17 Move18 Move19 Move20 Move21 Move22 Move23 Move24 Move25 Move26 Move27 Move28 Move29 Move30 Move31 Move32 Move33 Move34 Move35 Move36 Move37 Move38 Move39 Move40 Move41 Move42 Move43 Move44 Move45 Move46 Move47 Move48 Move49 Move50 Move51 Move52 Move53 Move54 Move55 Move56 Move57 Move58 Move59 Move60 Move61 Move62 Move63 Move64 Move65 Move66 Move67 Move68 Move69 Move70 Move71 Move72 Move73 Move74 Move75 Move76 Move77 Move78 Move79 Move80 Move81 Move82 Move83 Move84 Move85 Move86 Move87 Move88 Move89 Move90 Move91 Move92 Move93 Move94 Move95 Move96 Move97 Move98 Move99 Move100 Move101 Move102 Move103 Move104 Move105 Move106 Move107 Move108 Move109 Move110 Move111 Move112 Move113 Move114 Move115 Move116 Move117 Move118 Move119 Move120 Move121 Move122 Move123 Move124 Move125 Move126 Move127 Move128 Move129 Move130 Move131 Move132 Move133 Move134 Move135 Move136 Move137 Move138 Move139 Move140 Move141 Move142 Move143 Move144 Move145 Move146 Move147 Move148 Move149 Move150 Move151 Move152 Move153 Move154 Move155 Move156 Move157 Move158 Move159 Move160 Move161 Move162 Move163 Move164 Move165 Move166 Move167 Move168 Move169 Move170 Move171 Move172 Move173 Move174 Move175 Move176 Move177 Move178 Move179 Move180 Move181 Move182 Move183 Move184 Move185 Move186 Move187 Move188 Move189 Move190 Move191 Move192 Move193 Move194 Move195 Move196 Move197 Move198 Move199 Move200


# Keep only the columns used below, then materialise the dask frame as an
# in-memory pandas DataFrame.
kept_columns = [
    'Site',
    'Result',
    'WhiteElo',
    'BlackElo',
    'WhiteRatingDifference',
    'OpeningName',
    'TimeControl',
    'TimeIncrement',
    'Termination',
]
df = df[kept_columns].compute()
df.head()


# Drop games that ended with a 'Rules infraction' termination (treated here
# as games where a player cheated), then drop every row that still has a
# missing value in any remaining column.
df = df[df.Termination != 'Rules infraction'].dropna()
print(len(df))

5000825


# Keep only games whose absolute WhiteRatingDifference is at most 300.
# NOTE(review): in the sample rows this column follows the sign of the result
# (e.g. +7 after a White win between players rated 1504 and 1505), so it looks
# like the rating change from the game rather than the rating gap between the
# players — confirm this is the intended filter.
df = df[df.WhiteRatingDifference.abs() <= 300]


# Encode each result as White's score (win = 1, draw = 0.5, loss = 0).
# Series.map produces a float64 column directly; any label that is not in the
# mapping (such as '*') becomes NaN and is removed by the dropna below, so the
# column never ends up with mixed string/float values.
result_scores = {'1-0': 1.0, '0-1': 0.0, '1/2-1/2': 0.5}
df['Result'] = df.Result.map(result_scores)
df.dropna(inplace=True)
df.Result.head()

0    1.0
1    0.0
2    1.0
3    0.0
4    1.0
Name: Result, dtype: float64


# Function to convert time controls to categorical
def convert_time_controls(row, assumed_moves=40):
    """Classify a game as "Bullet", "Blitz" or "Rapid" from its clock settings.

    The estimated game length is
    ``int(TimeControl) + int(TimeIncrement) * assumed_moves``; the increment
    is multiplied by the number of moves a game is assumed to last.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'TimeControl'`` and ``'TimeIncrement'``
        (a DataFrame row or a plain dict). Both values are truncated with
        ``int`` before use.
    assumed_moves : int, optional
        Number of moves a game is assumed to last (default 40).

    Returns
    -------
    str
        "Bullet" if the estimate is below 180, "Blitz" if it is below 600,
        otherwise "Rapid".
    """
    # Compute the estimate once instead of repeating it in every branch.
    estimated = int(row['TimeControl']) + int(row['TimeIncrement']) * assumed_moves
    if estimated < 180:
        return "Bullet"
    if estimated < 600:
        return "Blitz"
    return "Rapid"


# No missing values remain at this point, so nunique() counts the same
# values as building a set of the column.
print("There were", df.OpeningName.nunique(), "unique openings in our data")
# Split "<opening>: <line>" at the first colon into opening and opening line.
# The pieces are stripped so the line does not keep the space that follows
# the colon; names without a colon get a missing OpeningLine.
split_opening = df.OpeningName.str.split(':', n=1, expand=True)
df['Opening'] = split_opening[0].str.strip()
df['OpeningLine'] = split_opening[1].str.strip()
df.drop(columns='OpeningName', inplace=True)

print("Now we reduced that to", df.Opening.nunique(), "unique openings")

# Make time control data categorical.
# Vectorised form of the per-row rule: base time plus the increment over an
# assumed 40 moves; below 180 is Bullet, below 600 is Blitz, otherwise Rapid.
# This avoids calling a Python function once per row.
estimated_length = df.TimeControl.astype(int) + df.TimeIncrement.astype(int) * 40
df['Speed'] = np.select(
    [estimated_length < 180, estimated_length < 600],
    ["Bullet", "Blitz"],
    default="Rapid",
)
df.drop(columns=['TimeControl', 'TimeIncrement'], inplace=True)

df[['Opening', 'OpeningLine', 'Speed']].head()

There were 2901 unique openings in our data
Now we reduced that to 350 unique openings


# Discard every game whose opening is recorded as '?'
df = df.loc[df.Opening.ne('?')]


# Three side-by-side panels sharing one y-axis
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True, figsize=(10,3))

# One violin plot per panel: (axis, column, title, x-axis label)
violin_panels = [
    (ax1, 'WhiteElo', "Distribution of White ELO Ratings", "ELO rating"),
    (ax2, 'BlackElo', "Distribution of Black ELO Ratings", "ELO rating"),
    (ax3, 'WhiteRatingDifference', "Distribution of Rating Difference",
     "Rating difference (relative to white)"),
]
for panel_ax, column, title, xlabel in violin_panels:
    sns.violinplot(x=column, data=df, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)

plt.tight_layout() # Helps format the graphs


# Tighten the filter: keep only games whose absolute WhiteRatingDifference
# is at most 25.
# NOTE(review): the sample rows suggest this column is the rating change from
# the game rather than the rating gap between the players — confirm.
df = df[df.WhiteRatingDifference.abs() <= 25]


# Share of games at each time control. The mean of a boolean mask is the
# fraction of rows that match, so no filtered copy of df is built.
bullet_games = (df.Speed == "Bullet").mean()
blitz_games = (df.Speed == "Blitz").mean()
rapid_games = (df.Speed == "Rapid").mean()
speeds = [bullet_games, blitz_games, rapid_games]

# Share of each result
white_wins = (df.Result == 1.0).mean()
black_wins = (df.Result == 0.0).mean()
draws = (df.Result == 0.5).mean()
win_percs = [white_wins, black_wins, draws]

# Plot results in pie charts.
# The wedges are normalised by matplotlib (the default) rather than passing
# normalize=False, which raises ValueError when floating-point rounding makes
# the fractions sum to slightly more than 1.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(8,8))
ax1.pie(speeds, labels=["Bullet", "Blitz", "Rapid"], autopct='%1.1f%%', shadow=True, explode=(.03,.03,.03));
ax1.set_title("Game Speeds")

ax2.pie(win_percs, labels=["White", "Black", "Draw"], autopct='%1.1f%%', shadow=True, explode=(.03,.03,.1));
ax2.set_title("Win Percentages");


# Per-opening summary. 'Result' is the total number of points scored with
# that opening (win = 1, draw = 1/2, loss = 0), 'TotalGames' the number of
# games, and 'WinPerc' the points scored per game.
games_by_opening = df[['Opening', 'Result']].groupby('Opening')
openings = games_by_opening.sum()
openings['TotalGames'] = games_by_opening.size()
openings['WinPerc'] = openings['Result'] / openings['TotalGames']
openings.head()


# Keep only openings that were played at least 5000 times
openings = openings.loc[openings.TotalGames.ge(5000)]
openings.head()


# Rank openings by win percentage and keep the best 25
ranked_by_win = openings.sort_values('WinPerc', ascending=False)
top_25 = ranked_by_win.head(25).reset_index()

ax = sns.barplot(data=top_25, x="Opening", y="WinPerc")
ax.set_ylabel("Win Percentage")
ax.set_ylim(.5, .6) # Narrow the y-range so small differences are visible
plt.xticks(rotation='vertical') # Make openings readable
ax.set_title("Best Openings by Win Percentage");


# Number of games played for the same 25 openings
ax = sns.barplot(data=top_25, x="Opening", y="TotalGames")
ax.set_ylabel("Number of Games Played")
plt.xticks(rotation='vertical')
ax.set_title("Frequency of Best Openings by Win Percentage");


# 'Strength' weights the win percentage by the natural log of the number of
# games played with the opening.
openings['Strength'] = openings.WinPerc * np.log(openings.TotalGames)

# The 25 openings that score highest on this measure
ranked_by_strength = openings.sort_values('Strength', ascending=False)
top_25 = ranked_by_strength.head(25).reset_index()

ax = sns.barplot(data=top_25, x="Opening", y="Strength")
ax.set_ylabel("Strength")
ax.set_ylim(bottom=5.5) # Only the lower limit is fixed
plt.xticks(rotation='vertical')
ax.set_title("Best Openings by Strength");


# Top 10 most popular openings (by number of games), plotted by win percentage
top_10_pop = openings.sort_values('TotalGames', ascending=False)[:10].reset_index()

ax = sns.barplot(x="Opening", y="WinPerc", data=top_10_pop)
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylim(.45,)
# The bars show WinPerc, so label the axis accordingly (it used to read
# "Total Games", which is only the ranking criterion).
ax.set_ylabel("Win Percentage")
ax.set_title("Most Popular Openings");


# Top 10 openings by time control
def top_by_speed(df, speed):
    """Return the ten most-played openings among games of the given speed.

    Parameters
    ----------
    df : DataFrame with 'Opening', 'Result' and 'Speed' columns.
    speed : value compared against the 'Speed' column.

    Returns
    -------
    DataFrame with columns 'Opening', 'Result' (summed result), 'TotalGames'
    (number of games) and 'WinPerc' (Result / TotalGames), sorted by
    'TotalGames' in descending order and limited to ten rows.
    """
    games = df.loc[df.Speed == speed, ['Opening', 'Result']]
    per_opening = games.groupby('Opening')

    summary = per_opening.sum()
    summary['TotalGames'] = per_opening.size()
    summary['WinPerc'] = summary['Result'] / summary['TotalGames']

    ranked = summary.sort_values('TotalGames', ascending=False)
    return ranked.head(10).reset_index()


# Ten most popular openings at each time control
top_bullet = top_by_speed(df, "Bullet")
top_blitz = top_by_speed(df, "Blitz")
top_rapid = top_by_speed(df, "Rapid")

# One stacked panel per time control
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(12,15))

# (axis, data, title, extra keyword arguments for the x-axis label)
speed_panels = [
    (ax1, top_bullet, "Top Bullet Openings", {"va": "bottom"}),
    (ax2, top_blitz, "Top Blitz Openings", {}),
    (ax3, top_rapid, "Top Rapid Openings", {}),
]
drawn_axes = []
for panel_ax, top_openings, title, xlabel_kwargs in speed_panels:
    bars = sns.barplot(x='Opening', y='WinPerc', data=top_openings, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Opening", **xlabel_kwargs)
    panel_ax.set_ylabel("Win Percentage")
    panel_ax.tick_params(axis='x', labelrotation=45)
    panel_ax.set_ylim(.45,)
    # Write each bar's height at the top of the bar; row.name is the row's
    # index label, used here as the bar's x position
    for index, row in top_openings.iterrows():
        bars.text(row.name,row.WinPerc, round(row.WinPerc,3), color='black', ha="center")
    drawn_axes.append(bars)
g1, g2, g3 = drawn_axes

plt.tight_layout() # Helps format the graphs


# Average of the two players' ratings for each game, then a copy of the
# data ordered by that average
df['avgElo'] = 0.5 * (df.WhiteElo + df.BlackElo)
ratings_df = df.sort_values(by='avgElo')
ratings_df.head()


# Plot the top 10 most popular openings by rating level
def top_by_rating(low, ax):
    """Draw one animation frame: the ten most-played openings in a rating band.

    The band covers games whose ``avgElo`` lies in ``[low, low + 100)``.
    Reads the module-level ``df`` (games), ``colors`` (opening -> bar colour)
    and ``ticker`` (matplotlib.ticker), which must exist when this is called.

    Parameters
    ----------
    low : number
        Lower bound of the rating band. The value 2800 is a sentinel: the
        current figure is cleared and nothing is drawn.
    ax : matplotlib Axes
        Axes to draw on; it is cleared before every frame.
    """
    # Clear the figure if we're done
    if low == 2800:
        plt.clf()
        return
    # Range of 100 points of elo
    high = low + 100

    # Subset data to be in range
    in_band = df[(df.avgElo >= low) & (df.avgElo < high)]

    # Only the number of games per opening is needed for this chart
    counts = in_band.groupby('Opening').size().rename('TotalGames').reset_index()
    top_10 = counts.sort_values('TotalGames', ascending=False)[:10]
    data = top_10.iloc[::-1] # Reverse so the most popular bar is drawn last (on top)
    ax.clear() # Clear axis from last time

    # Draw a horizontal barplot of most popular opening by rating, with the color
    # being defined by the opening. Openings missing from the colour map get a
    # neutral grey instead of raising KeyError.
    bar_colors = [colors.get(name, '#999999') for name in data.Opening]
    ax.barh(data.Opening, data.TotalGames, color=bar_colors)
    dx = data.TotalGames.max() / 200 # Small horizontal offset for the labels

    # Add labels + format
    for i, (value, name) in enumerate(zip(data.TotalGames, data.Opening)):
        ax.text(value-dx, i,     name,           size=14, weight=600, ha='right', va='bottom')
        ax.text(value+dx, i,     f'{value:,.0f}',  size=14, ha='left',  va='center')
    ax.text(1, 0.2, "Ratings: "+str(low) + "-"+ str(high), transform=ax.transAxes, color='#777777', size=36, ha='right', weight=800)
    ax.text(0, 1.06, 'Number of Game Played', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'The Most Popular Openings by ELO',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    # Hide the frame of the axes that was passed in, rather than of whichever
    # axes pyplot currently considers active
    ax.set_frame_on(False)


import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML

# Bar colour used for each opening in the animated chart
colors = {
    "King's Pawn Game": 'b',
    "Queen's Pawn Game": 'g',
    "Van't Kruijs Opening": 'r',
    "Scandinavian Defense": 'c',
    "Italian Game": 'm',
    "Gedult's Opening": 'y',
    "Russian Game": '#232b2b',
    "French Defense": '#FFA500',
    "English Opening": '#800080',
    "Sicilian Defense": '#52899A',
    "Bishop's Opening": "#F2795F",
    "Zukertort Opening": "#696969",
    "Four Knights Game": "#CD9DD4",
    "Ruy Lopez": "#F90E9F",
    "Scotch Game": "#B3AE0D",
    "Caro-Kann Defense": "#35D5B9",
    "Modern Defense": "#49B303",
    "Queen's Gambit Declined": "#5A65BD",
    "Indian Game": "#A30C6C",
    "Hungarian Opening": "#C46D10",
    "Nimzo-Larsen Attack": "#CEB8A1",
    "King's Indian Attack": "#0AF95C",
}

# Build the animation: one frame per rating band, starting at 600 and going
# up in steps of 100. The final value (2800) makes top_by_rating clear the
# figure instead of drawing a band.
fig, ax = plt.subplots(figsize=(15, 8))
rating_floors = np.arange(600, 2801, 100)
animator = animation.FuncAnimation(
    fig,
    top_by_rating,
    frames=rating_floors,
    fargs=(ax,),
    interval=500,
)
HTML(animator.to_jshtml())

<Figure size 1080x576 with 0 Axes>


# Updated function from before to work for Sicilian opening
def top_by_speed_sicilian(df, speed):
    """Return the ten most-played Sicilian Defense lines at the given speed.

    Parameters
    ----------
    df : DataFrame with 'Opening', 'OpeningLine', 'Result' and 'Speed' columns.
    speed : value compared against the 'Speed' column.

    Returns
    -------
    DataFrame with columns 'OpeningLine', 'Result' (summed result),
    'TotalGames' (number of games) and 'WinPerc' (Result / TotalGames),
    sorted by 'TotalGames' in descending order and limited to ten rows.
    """
    # Apply both filters to the frame that was passed in. The speed filter
    # used to reference an undefined name, which raised NameError and would
    # have discarded the Sicilian Defense filter.
    sub = df[(df.Opening == "Sicilian Defense") & (df.Speed == speed)]
    by_line = sub[['OpeningLine', 'Result']].groupby('OpeningLine')
    openings = by_line.sum()
    openings['TotalGames'] = by_line.size()
    openings['WinPerc'] = openings.Result / openings.TotalGames
    return openings.sort_values('TotalGames', ascending=False)[:10].reset_index()


# Same as before, just with analyzing lines of the Sicilian Defense
top_bullet = top_by_speed_sicilian(df, "Bullet")
top_blitz = top_by_speed_sicilian(df, "Blitz")
top_rapid = top_by_speed_sicilian(df, "Rapid")

fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(12,15))

# (axis, data, title, extra keyword arguments for the x-axis label).
# Titles and axis labels describe what is plotted here: the win percentage of
# Sicilian Defense lines. They used to be copied from the openings chart and
# labelled the y-axis "Count".
sicilian_panels = [
    (ax1, top_bullet, "Top Bullet Sicilian Defense Lines", {"va": "bottom"}),
    (ax2, top_blitz, "Top Blitz Sicilian Defense Lines", {}),
    (ax3, top_rapid, "Top Rapid Sicilian Defense Lines", {}),
]
drawn_axes = []
for panel_ax, top_lines, title, xlabel_kwargs in sicilian_panels:
    bars = sns.barplot(x='OpeningLine', y='WinPerc', data=top_lines, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Opening Line", **xlabel_kwargs)
    panel_ax.set_ylabel("Win Percentage")
    panel_ax.tick_params(axis='x', labelrotation=45)
    panel_ax.set_ylim(.45,)
    # Write each bar's height at the top of the bar; row.name is the row's
    # index label, used here as the bar's x position
    for index, row in top_lines.iterrows():
        bars.text(row.name, row.WinPerc, round(row.WinPerc, 3), color='black', ha="center")
    drawn_axes.append(bars)
g1, g2, g3 = drawn_axes

plt.tight_layout() # Helps format the graphs

	Event	Site	Date	Round	White	Black	Result	UtcDateTime	WhiteElo	BlackElo	...	Move191	Move192	Move193	Move194	Move195	Move196	Move197	Move198	Move199	Move200
0	Rated Rapid tournament https://lichess.org/tou...	https://lichess.org/bHud36o4	11/1/2019	-	e_0shams0	foxfless	1-0	11/1/2019 12:00:03 AM	1504.0	1505.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Rated Rapid tournament https://lichess.org/tou...	https://lichess.org/rKE2g6yF	11/1/2019	-	Andrej1909	ewafx	0-1	11/1/2019 12:00:03 AM	1487.0	1485.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	Rated Rapid tournament https://lichess.org/tou...	https://lichess.org/WT12ZLCv	11/1/2019	-	Lassandra	Kaival145	1-0	11/1/2019 12:00:03 AM	1516.0	1523.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	Rated Blitz tournament https://lichess.org/tou...	https://lichess.org/dMdEbs4b	11/1/2019	-	Ayowole	vallar_morghullis	0-1	11/1/2019 12:00:03 AM	1682.0	1666.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Rated Blitz tournament https://lichess.org/tou...	https://lichess.org/VBCLUNgj	11/1/2019	-	sochparov	HishamElalawy	1-0	11/1/2019 12:00:03 AM	1765.0	1766.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Site	Result	WhiteElo	BlackElo	WhiteRatingDifference	OpeningName	TimeControl	Termination
0	https://lichess.org/bHud36o4	1-0	1504.0	1505.0	7.0	Van't Kruijs Opening	600.0	Normal
1	https://lichess.org/rKE2g6yF	0-1	1487.0	1485.0	-7.0	Carr Defense	600.0	Normal
2	https://lichess.org/WT12ZLCv	1-0	1516.0	1523.0	13.0	Nimzo-Indian Defense: Classical Variation, Noa...	600.0	Normal
3	https://lichess.org/dMdEbs4b	0-1	1682.0	1666.0	-8.0	Ruy Lopez: Morphy Defense, Neo-Arkhangelsk Var...	300.0	Time forfeit
4	https://lichess.org/VBCLUNgj	1-0	1765.0	1766.0	7.0	Italian Game: Giuoco Pianissimo	300.0	Normal

	Result	TotalGames	WinPerc
Opening
Alekhine Defense	26865.0	53780	0.499535
Alekhine Defense #2	1470.0	2710	0.542435
Alekhine Defense #3	215.0	491	0.437882
Amar Gambit	2.0	2	1.000000
Amar Opening	668.5	1389	0.481281

	Result	TotalGames	WinPerc
Opening
Alekhine Defense	26865.0	53780	0.499535
Benoni Defense	13368.0	26794	0.498918
Bird Opening	25345.0	49466	0.512372
Bishop's Opening	42858.0	78595	0.545302
Blackmar-Diemer Gambit	8533.0	16543	0.515807

	Site	Result	WhiteElo	BlackElo	WhiteRatingDifference	Termination	Opening	OpeningLine	Speed	avgElo
31374	https://lichess.org/4ybVbZwe	1.0	605.0	606.0	8.0	Normal	Russian Game	Urusov Gambit	Blitz	605.5
105966	https://lichess.org/poOXd2BT	1.0	613.0	600.0	18.0	Normal	Italian Game	None	Bullet	606.5
30771	https://lichess.org/HzXYRKwo	0.0	602.0	619.0	-2.0	Normal	Gedult's Opening	None	Blitz	610.5
106675	https://lichess.org/1Z4SOSD9	0.0	600.0	631.0	0.0	Time forfeit	Queen's Pawn Game	Chigorin Variation	Bullet	615.5
70438	https://lichess.org/ZwcWEvYK	1.0	605.0	627.0	11.0	Time forfeit	Grob Opening	None	Blitz	616.0

An Analysis of Chess Openings

Tutorial by Ben Moskowitz

Introduction¶

Setup¶

Data Collection¶

Data Cleaning¶

Missing Data¶

Large Rating Difference¶

Data Type Conversion¶

Formatting Changes¶

Adding a Column Indicating Color¶

Exploratory Analysis¶

Distribution of Data¶

"Best" Openings¶

Best Openings By WinPerc*Freq¶

Reevaluation of the Question¶

Updated Question¶

Most Popular Openings¶

Openings Popularity in Different Time Formats¶

Opening Popularity Across Different Ratings¶

In-Depth Exploration of an Opening¶

Conclusion + Further Work¶

Additional Resources¶

	Opening	OpeningLine	Speed
0	Van't Kruijs Opening	None	Rapid
1	Carr Defense	None	Rapid
2	Nimzo-Indian Defense	Classical Variation, Noa Variation	Rapid
3	Ruy Lopez	Morphy Defense, Neo-Arkhangelsk Variation	Blitz
4	Italian Game	Giuoco Pianissimo	Blitz