import pandas as pd
import json
import requests
import matplotlib.pyplot as plt
import re
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
import statistics


# Dated Scryfall bulk-data file that the whole analysis is built from.
URL = "https://data.scryfall.io/default-cards/default-cards-20221212220657.json"

# Fail loudly on an HTTP error instead of handing an error page to the JSON
# parser, and do not hang forever if the server stops responding (the timeout
# applies per socket operation, not to the whole download).
response = requests.get(URL, timeout=60)
response.raise_for_status()
full_scryfall_df = pd.DataFrame(json.loads(response.text))
full_scryfall_df.head()


# Columns kept for the analysis; everything else in the bulk file is ignored.
wanted_columns = [
    'name',                       # the name of the card - not technically necessary but helpful for debugging
    'mana_cost',                  # what type of mana the card costs to summon
    'cmc',                        # how much mana the card costs
    'type_line',                  # the type of the card (creature, sorcery, etc)
    'oracle_text',                # what the card does
    'power', 'toughness',         # the strength of the card if it's a creature
    'colors', 'color_identity',   # more info on what type of mana the card costs
    'keywords',                   # the keywords on the card (more on this later)
    'set', 'released_at',         # when the card was released
    'rarity',                     # how much the card was printed
    'games',                      # games tells if it is legal online or in paper (we exclude online-only cards)
    'legalities',                 # which formats the card is legal in
]
df = full_scryfall_df[wanted_columns]


# Keep only cards that exist in paper.
is_paper = df['games'].apply(lambda games: 'paper' in games)
df = df[is_paper]


# Sort by release date first so that de-duplicating by name keeps the
# earliest printing of every card.
df = df.sort_values(by=['released_at', 'name']).drop_duplicates(subset=['name'])


def legal(legalities):
    """Tell whether a card is playable in at least one format.

    ``legalities`` maps a format name to a status string.  The card is
    rejected only when the mapping is non-empty and every status is
    ``"not_legal"``; any other mapping (including an empty one) is accepted.
    """
    statuses = set(legalities.values())
    return statuses != {"not_legal"}

# Keep cards that are legal somewhere, then throw out tokens.
playable_mask = df['legalities'].apply(legal)
df = df[playable_mask]
token_mask = df["type_line"].str.contains("Token", na=False)
df = df[~token_mask] # remove tokens


# Look up the set codes of the listed sets by name and drop every card
# whose set code contains one of them.
unsets = ['unglued', 'unhinged', 'unstable', 'unsanctioned', 'unfinity']
sets = json.loads(requests.get("https://api.scryfall.com/sets").text)
unset_codes = [s['code'] for s in sets["data"] if s['name'].lower() in unsets]
for code in unset_codes:
    df = df[~df["set"].str.contains(code)]


df = df.dropna() # drop NAN values

# Express the release date as whole calendar years since Alpha's release year.
alpha_release_date = dt.datetime(1993, 8, 5)
alpha_year = alpha_release_date.year
df['released_at'] = df['released_at'].apply(
    lambda released: dt.datetime.strptime(released, '%Y-%m-%d').year - alpha_year)


# A mana symbol is a colored pip when it mentions one of the five color
# letters.  This counts hybrid/Phyrexian symbols such as {W/U} or {G/P} and
# skips symbols that carry no color ({X}, {C}, {S}, generic numbers).  The
# pattern is a raw string so the backslashes are not parsed as (invalid)
# string escapes.
colored_pip = re.compile(r"\{[^{}]*[WUBRG][^{}]*\}")
df["num_colored_pips"] = df["mana_cost"].apply(lambda mana_cost: len(colored_pip.findall(str(mana_cost))))
df["num_colors"] = df["colors"].apply(len)


# From here on only creatures are analysed.
df = df[df["type_line"].str.contains("Creature", na=False)]

def make_int(i):
    """Convert *i* to an ``int``, or return NaN when that is not possible.

    Values that have no integer reading -- non-numeric strings such as
    ``'*'``, ``None``, NaN or infinity -- all yield ``np.nan`` so that the
    caller can drop them afterwards instead of crashing mid-conversion.
    """
    try:
        return int(i)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparsable string or NaN; TypeError: None and other
        # non-numeric objects; OverflowError: +/- infinity.
        return np.nan

# Turn the numeric columns into nullable integers; values that make_int
# cannot convert become missing and are dropped right after.
numeric_columns = ['cmc', 'power', 'toughness']
for column in numeric_columns:
    converted = df[column].apply(make_int)
    df[column] = converted.astype('Int64')
df = df.dropna()


order = ['White', 'Blue', 'Black', 'Red', 'Green']
# The one-letter codes stored in the 'colors' column.  Blue is 'U', not the
# first letter of its name -- taking name[0] would count black cards for blue.
color_codes = {'White': 'W', 'Blue': 'U', 'Black': 'B', 'Red': 'R', 'Green': 'G'}
creatures_per_color = [
    int(df['colors'].apply(lambda card_colors, code=color_codes[c]: code in card_colors).sum())
    for c in order
]
plt.pie(creatures_per_color,
        labels=order, colors=order,
        wedgeprops={"edgecolor":"k",'linewidth': 2}) # to draw a border for the white creatures, otherwise invisible
plt.axis('equal') 
plt.title("Creatures by Color")
plt.show()


# Include the most recent year as well (range() excludes its upper bound).
r = range(max(df['released_at']) + 1)
# Average number of whitespace-separated words of rules text per release
# year, matching the axis label.  Years without any creature show up as a
# gap (NaN) instead of raising.
words_per_card = df['oracle_text'].str.split().str.len()
mean_words_per_year = words_per_card.groupby(df['released_at']).mean().reindex(list(r))
plt.plot(r, mean_words_per_year, color='purple')
plt.xlabel("Years since Alpha")
plt.ylabel("Average number of Words")
plt.title("Complexity Creep over Time")
plt.show()


df.head()


# Look up two specific creatures by (partial) name.
grizzly_bears_mask = df["name"].str.contains("Grizzly Bears")
df[grizzly_bears_mask]


coral_eel_mask = df["name"].str.contains("Coral Eel")
df[coral_eel_mask]


# Vanilla creatures are the ones with no rules text at all.
has_no_text = df["oracle_text"] == ""
vanilla_df = df[has_no_text]
vanilla_df.head()


# vanilla_df is a subset of df, so the "other" wedge must exclude the vanilla
# creatures; otherwise they are counted in both wedges.
num_vanilla = len(vanilla_df.index)
num_other = len(df.index) - num_vanilla
plt.pie([num_vanilla, num_other], 
        explode=(0.1, 0), 
        labels=['Vanilla Creatures', 'Other Creatures'], 
        colors=['red', 'purple'],
        startangle=90)
plt.axis('equal') 
plt.show()


# Include the most recent year as well (range() excludes its upper bound).
r = range(max(df['released_at']) + 1)
# Number of vanilla creatures first released in each year; years with none
# are plotted as 0.
vanilla_per_year = vanilla_df['released_at'].value_counts().reindex(list(r), fill_value=0)
plt.plot(r, vanilla_per_year, color='red')
plt.xlabel("Years since Alpha")
plt.ylabel("Number of Vanilla Creatures")
plt.title("Vanilla Creatures over Time")
plt.show()


# Grid dimensions: one row per power value and one column per toughness value.
mp = max(vanilla_df['power']) + 1
mt = max(vanilla_df['toughness']) + 1
data = np.zeros((mp, mt))

fig, ax = plt.subplots()

for power in range(mp):
    grid_row = mp - power - 1  # rows are flipped so that power grows upwards
    has_power = vanilla_df['power'] == power
    for toughness in range(mt):
        has_toughness = vanilla_df['toughness'] == toughness
        count = len(vanilla_df[has_power & has_toughness]['cmc'].values)
        data[grid_row][toughness] = count

        # Write the count into the cell it belongs to.
        text = ax.text(toughness, grid_row, count,
                       ha="center", va="center", color="w")

flipped_power_labels = list(range(mp))[::-1]
ax.set_yticks(np.arange(mp), labels=flipped_power_labels) # invert power so that 0/0 is the bottom left corner

im = ax.imshow(data)
fig.tight_layout()
plt.xlabel("Toughness")
plt.ylabel("Power")
plt.title("Power vs Toughness of Vanilla Creatures")
plt.show()


# value_counts() orders by frequency, so look the counts up by rarity name to
# guarantee every bar sits under its own label (0 when a rarity is absent).
# Rarities outside these three are not shown.
vanilla_rarity_order = ['common', 'uncommon', 'rare']
vanilla_rarity_counts = vanilla_df['rarity'].value_counts().reindex(vanilla_rarity_order, fill_value=0)
plt.bar(['Common', 'Uncommon', 'Rare'], vanilla_rarity_counts, color=['black', 'silver', 'gold'])
plt.title("Vanilla Rarity Distribution")
plt.xlabel("Rarity")
plt.ylabel("Number")
plt.show()


# Columns that play no part in the vanilla regression.
unused_vanilla_columns = [
    "mana_cost", "type_line", "oracle_text", "color_identity",
    "keywords", "set", "games", "legalities",
]
data_df = vanilla_df.drop(columns=unused_vanilla_columns)
data_df.head()


def dummy_list(data_df, one_hot_df, column, predicate=lambda i, j: i == j):
    """Add one 0/1 indicator column per distinct value of ``column``.

    Parameters:
        data_df: frame holding ``column``; cells may be scalars or lists
            (lists are flattened to discover the distinct values).
        one_hot_df: frame that receives the new ``f'{column}_{value}'``
            columns; it is modified in place.
        column: name of the column to encode.
        predicate: called as ``predicate(value, cell)`` for every cell; its
            truthiness becomes the 0/1 entry.  The default tests equality;
            pass e.g. ``lambda i, j: i in j`` for list-valued cells.

    Missing values never get a column of their own, and the new columns are
    added in sorted order so the layout is the same on every run.
    """
    # dropna() removes every kind of missing value; a plain ``np.nan in set``
    # test misses NaNs that are equal-but-not-identical float objects.
    values = data_df.explode(column)[column].dropna().unique()

    for value in sorted(values, key=str):  # make the new one-hot column
        one_hot_df[f'{column}_{value}'] = data_df[column].apply(
            lambda cell, value=value: int(predicate(value, cell)))

# Replace the two categorical columns with 0/1 indicator columns.
categorical_columns = ['colors', 'rarity']
one_hot_df = data_df.drop(columns=categorical_columns)
# 'colors' holds lists, so a value matches when it is contained in the cell.
dummy_list(data_df, one_hot_df, 'colors', predicate=lambda value, cell: value in cell)
dummy_list(data_df, one_hot_df, 'rarity')
one_hot_df.head()


# Predictors are every column except the card name and the target (cmc);
# a constant column is added for the regression.
vanilla_predictors = one_hot_df.drop(columns=['name', 'cmc'])
y = one_hot_df['cmc']
X = sm.add_constant(vanilla_predictors)


# Fit statsmodels' OLS (Ordinary Least Squares) model, making sure it's on integers
endog = np.asarray(y, dtype=int)
exog = np.asarray(X, dtype=int)
model = sm.OLS(endog, exog).fit()

# Plot the coefficients, with error bars
plt.errorbar(model.params, X.columns, xerr=model.bse, fmt='o',
             alpha=0.5, ecolor='grey', capsize=5)
plt.xlabel("Coefficient")
plt.ylabel("Variable")
plt.title("Linear Regression for Converted Mana Cost of Vanilla Creatures")
plt.show()


def expected_mana_cost(row):
    """Return the mana value the currently fitted ``model`` predicts for *row*.

    ``row`` holds the predictor values in the order of the model's
    parameters, excluding the first parameter, which is added on its own.
    """
    first_param = model.params[0]
    remaining_params = model.params[1:]
    weighted = row * remaining_params
    return sum(weighted) + first_param

# Predict a mana value for every vanilla creature from its predictor columns.
vanilla_feature_rows = one_hot_df.drop(columns=['name', 'cmc'])
one_hot_df['expected_mana_value'] = vanilla_feature_rows.apply(expected_mana_cost, axis=1)
one_hot_df


# Collect every keyword that appears on any creature.
keywords = set()
flavor_word_sets = ['afr', '40k', 'clb', 'sld']
for set_code, card_keywords in zip(df['set'], df['keywords']):
    if set_code in flavor_word_sets:
        # ignore flavor words by ignoring keywords with spaces
        card_keywords = [kw for kw in card_keywords if ' ' not in kw]
    # normal sets contribute all of their keywords
    keywords.update(card_keywords)

keyword_soup = ' '.join(keywords).lower() # easy way to convert everything to lowercase


# Each pattern marks, with its first capturing group, the part of the rules
# text that should be deleted.
removes = [r'(\(.*?\))', # remove parentheses (reminder text)
           r'(\{.*?\})', # remove mana costs for certain abilities (outlast, etc.)
           r'(—[^ ][^\n]*)', # remove ward costs
           r'[Pp]rotection(?! F)([^\n]*)', # remove protection type
           r'(\d+)', # remove numbers for certain abilities (rampage, etc.)
           r'Prototype([^\n]*)'] # remove prototype costs


def _drop_first_group(match):
    """Return the matched text with its first capturing group cut out."""
    group_start, group_end = match.span(1)
    if group_start == -1:  # the group did not take part in the match
        return match.group(0)
    offset = match.start(0)
    whole = match.group(0)
    return whole[:group_start - offset] + whole[group_end - offset:]


def extract_ability_text(row):
    """Reduce a card's rules text to bare, lower-case ability words.

    Reads ``row['oracle_text']`` and returns it with reminder text, mana
    costs, ward/protection/prototype details and numbers removed, commas and
    semicolons dropped, lower-cased and stripped.  Missing or empty text
    yields ``''``.

    Every occurrence of each pattern in ``removes`` is deleted, not just the
    first one, so cards with several reminder texts or costs are cleaned
    completely.
    """
    text = row['oracle_text']

    # Covers '', None and any NaN object, not only the np.nan singleton.
    if not isinstance(text, str) or not text:
        return ''

    for pattern in removes:
        text = re.sub(pattern, _drop_first_group, text)

    text = text.replace(',', '').replace(';', '')
    text = text.lower()
    text = text.strip()

    return text

# Store the cleaned-up rules text of every card (computed row by row) in a new column.
df['ability_text'] = df.apply(extract_ability_text, axis=1)


def is_french_vanilla(row, soup=None):
    """Tell whether a card's ability text consists of keywords only.

    Parameters:
        row: mapping/Series with an ``'ability_text'`` entry (the cleaned,
            lower-cased rules text).
        soup: space-separated lower-case keyword words to check against;
            defaults to the module-level ``keyword_soup``.

    Returns True when every whitespace-separated word of the ability text is
    one of the keyword words; an empty ability text therefore counts as
    French vanilla too.
    """
    if soup is None:
        soup = keyword_soup

    # Compare whole words.  Testing ``word in soup`` on the string itself is a
    # substring search, which would accept fragments such as "a" or "fly".
    keyword_words = set(soup.split())

    # note that keywords are never in front of a period or any punctuation
    # other than , or ; so we don't need to do any complicated tokenization
    return all(word in keyword_words for word in row['ability_text'].split())
# Flag every creature whose text is made of keywords only ...
df['is_french_vanilla'] = df.apply(is_french_vanilla, axis=1)


# ... and keep just those for the French vanilla analysis.
french_vanilla_df = df.loc[df['is_french_vanilla']]
french_vanilla_df.head()


# value_counts() orders by frequency, so look the counts up by rarity name to
# guarantee every bar sits under its own label (0 when a rarity is absent).
# Rarities outside these four are not shown.
fv_rarity_order = ['common', 'uncommon', 'rare', 'mythic']
fv_rarity_counts = french_vanilla_df['rarity'].value_counts().reindex(fv_rarity_order, fill_value=0)
plt.bar(['Common', 'Uncommon', 'Rare', 'Mythic'], 
        fv_rarity_counts, 
        color=['black', 'silver', 'gold', 'red'])
plt.title("French Vanilla Rarity Distribution")
plt.xlabel("Rarity")
plt.ylabel("Number")
plt.show()


# Columns that play no part in the French vanilla regression.
unused_fv_columns = ['mana_cost', 'type_line', 'color_identity', 'set',
                     'is_french_vanilla', 'games', 'legalities']
fv_data_df = french_vanilla_df.drop(columns=unused_fv_columns)

# Columns that are either encoded below or not numeric predictors.
encoded_or_text_columns = ['colors', 'oracle_text', 'keywords', 'rarity']
fv_one_hot_df = fv_data_df.drop(columns=encoded_or_text_columns)
# 'colors' holds lists, so a value matches when it is contained in the cell.
dummy_list(fv_data_df, fv_one_hot_df, 'colors', predicate=lambda value, cell: value in cell)
dummy_list(fv_data_df, fv_one_hot_df, 'rarity')
fv_one_hot_df.head()


# Total number of occurrences of each keyword across the French vanilla
# creatures, keyed by the name of the per-card count column added below.
keyword_counts = {}

for keyword in sorted(keywords):  # sorted: same column order on every run
    # Match the keyword as whole word(s) only; a plain substring count would
    # also find "flash" inside "flashback" or "landwalk" inside "islandwalk".
    whole_keyword = rf'(?<!\w){re.escape(keyword.lower())}(?!\w)'
    counts = fv_data_df['ability_text'].str.count(whole_keyword)
    total = int(counts.sum())
    if total > 0: # some keywords never appear on french vanilla creatures
        keyword_counts[f'keywords_{keyword}'] = total
        fv_one_hot_df[f'keywords_{keyword}'] = counts


# Predictors are every column except the name, the target (cmc) and the raw
# ability text; a constant column is added for the regression.
fv_predictors = fv_one_hot_df.drop(columns=['name', 'cmc', 'ability_text'])
X = sm.add_constant(fv_predictors)
y = fv_one_hot_df['cmc']

fv_endog = np.asarray(y, dtype=int)
fv_exog = np.asarray(X, dtype=int)
model = sm.OLS(fv_endog, fv_exog).fit()


# Keywords whose coefficients are shown in the keyword plot.
highlighted_keywords = ['Haste', 'Defender', 'Double strike', 'Flying', 'Trample', 'Cascade',
                        'Lifelink', 'Hexproof', 'Shroud', 'Fear', 'Intimidate', 'Delve',
                        'Convoke', 'Undying', 'Flash', 'Vigilance', 'Echo']
# Compare full column names; a substring test would let 'Flash' also select
# a 'keywords_Flashback' column, for example.
highlighted_columns = {f'keywords_{keyword}' for keyword in highlighted_keywords}

indices = []          # positions of the non-keyword predictors
keyword_indices = []  # positions of the highlighted keyword predictors
per_year = 0          # coefficient of 'released_at'
for position, name in enumerate(X.columns):
    if name not in keyword_counts:
        indices.append(position)
        if name == 'released_at':
            per_year = model.params[position] # for later
    elif name in highlighted_columns:
        keyword_indices.append([position, model.params[position]])

# Order the highlighted keywords by coefficient, then keep only the positions.
keyword_indices.sort(key=lambda pair: pair[1])
keyword_indices = [pair[0] for pair in keyword_indices]


# Draw the same coefficient plot twice: first for the non-keyword predictors,
# then for the selected keywords.
for selected_positions in (indices, keyword_indices):
    coefficients = [model.params[i] for i in selected_positions]
    variable_names = [X.columns[i] for i in selected_positions]
    standard_errors = [model.bse[i] for i in selected_positions]

    plt.errorbar(coefficients,
                 variable_names,
                 xerr=standard_errors,
                 color='green', alpha=0.5, fmt='o', ecolor='black', capsize=5)
    plt.xlabel("Coefficient")
    plt.ylabel("Variable")
    plt.title("Linear Regression for Converted Mana Cost of French Vanilla Creatures")
    plt.show()


# Predicted mana value of every French vanilla creature.
fv_feature_rows = fv_one_hot_df.drop(columns=['name', 'cmc', 'ability_text'])
fv_one_hot_df['expected_mana_value'] = fv_feature_rows.apply(expected_mana_cost, axis=1)
# Power level: predicted minus actual mana value, with the release-year
# contribution taken back out of the prediction.
fv_one_hot_df['power_level'] = fv_one_hot_df['expected_mana_value'] - per_year * fv_one_hot_df['released_at'] - fv_one_hot_df['cmc']

# Include the most recent year as well (range() excludes its upper bound).
r = range(max(fv_one_hot_df['released_at']) + 1)
# Mean power level per release year; a year without any French vanilla
# creature becomes a gap (NaN) in the line instead of raising.
mean_power_level = (fv_one_hot_df['power_level'].astype(float)
                    .groupby(fv_one_hot_df['released_at']).mean()
                    .reindex(list(r)))
plt.plot(r, mean_power_level)
plt.xlabel("Years since Alpha")
plt.ylabel("Difference in Expected vs Actual Mana Value")
plt.title("Power Level over Time")
plt.show()

	name	mana_cost	cmc	type_line	oracle_text	power	toughness	colors	color_identity	keywords	set	rarity	games	legalities	num_colored_pips	num_colors
32371	Air Elemental	{3}{U}{U}	5	Creature — Elemental	Flying	4	4	[U]	[U]	[Flying]	lea	uncommon	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	2	1
5420	Benalish Hero	{W}	1	Creature — Human Soldier	Banding (Any creatures with banding, and up to...	1	1	[W]	[W]	[Banding]	lea	common	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	1	1
26263	Birds of Paradise	{G}	1	Creature — Bird	Flying\n{T}: Add one mana of any color.	0	1	[G]	[G]	[Flying]	lea	rare	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	1	1
59195	Black Knight	{B}{B}	2	Creature — Human Knight	First strike (This creature deals combat damag...	2	2	[B]	[B]	[First strike, Protection]	lea	uncommon	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	2	1
31516	Bog Wraith	{3}{B}	4	Creature — Wraith	Swampwalk (This creature can't be blocked as l...	3	3	[B]	[B]	[Landwalk, Swampwalk]	lea	uncommon	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	1	1

	name	mana_cost	cmc	type_line	power	toughness	colors	color_identity	keywords	set	rarity	games	legalities	num_colored_pips	num_colors
58740	Craw Wurm	{4}{G}{G}	6	Creature — Wurm	6	4	[G]	[G]	[]	lea	common	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	2	1
54563	Earth Elemental	{3}{R}{R}	5	Creature — Elemental	4	5	[R]	[R]	[]	lea	uncommon	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	2	1
66828	Fire Elemental	{3}{R}{R}	5	Creature — Elemental	5	4	[R]	[R]	[]	lea	uncommon	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	2	1
35350	Gray Ogre	{2}{R}	3	Creature — Ogre	2	2	[R]	[R]	[]	lea	common	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	1	1
63158	Grizzly Bears	{1}{G}	2	Creature — Bear	2	2	[G]	[G]	[]	lea	common	[paper]	{'standard': 'not_legal', 'future': 'not_legal...	1	1

	name	cmc	power	toughness	colors	rarity	num_colored_pips	num_colors
58740	Craw Wurm	6	6	4	[G]	common	2	1
54563	Earth Elemental	5	4	5	[R]	uncommon	2	1
66828	Fire Elemental	5	5	4	[R]	uncommon	2	1
35350	Gray Ogre	3	2	2	[R]	common	1	1
63158	Grizzly Bears	2	2	2	[G]	common	1	1

	name	cmc	power	toughness	num_colored_pips	num_colors	colors_R	colors_G	rarity_common	rarity_uncommon
58740	Craw Wurm	6	6	4	2	1	0	1	1	0
54563	Earth Elemental	5	4	5	2	1	1	0	0	1
66828	Fire Elemental	5	5	4	2	1	1	0	0	1
35350	Gray Ogre	3	2	2	1	1	1	0	1	0
63158	Grizzly Bears	2	2	2	1	1	0	1	1	0

	name	cmc	power	toughness	released_at	num_colored_pips	num_colors	colors_W	colors_R	colors_U	colors_B	colors_G	rarity_common	rarity_rare	rarity_uncommon	expected_mana_value
58740	Craw Wurm	6	6	4	0	2	1	0	0	0	0	1	1	0	0	5.690056
54563	Earth Elemental	5	4	5	0	2	1	0	1	0	0	0	0	0	1	4.993144
66828	Fire Elemental	5	5	4	0	2	1	0	1	0	0	0	0	0	1	5.151129
35350	Gray Ogre	3	2	2	0	1	1	0	1	0	0	0	1	0	0	2.764881
63158	Grizzly Bears	2	2	2	0	1	1	0	0	0	0	1	1	0	0	2.186319
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
11291	Highborn Vampire	4	4	3	27	1	1	0	0	0	1	0	1	0	0	3.967672
73427	Murasa Brute	3	3	3	27	1	1	0	0	0	0	1	1	0	0	2.636510
22628	Grizzled Outrider	5	5	5	28	1	1	0	0	0	0	1	1	0	0	5.092426
22694	Ageless Guardian	2	1	4	28	1	1	1	0	0	0	0	1	0	0	2.308297
59796	Spined Karok	3	2	4	28	1	1	0	0	0	0	1	1	0	0	2.449176

An Analysis of Magic: The Gathering's Creatures¶

by Simon Chervenak¶

Vocabulary¶

Data Cleaning¶

Data Processing¶

Vanilla Creature Analysis¶

French Vanilla Analysis¶

Conclusions¶

	object	id	oracle_id	multiverse_ids	mtgo_id	mtgo_foil_id	tcgplayer_id	cardmarket_id	name	lang	...	tcgplayer_etched_id	attraction_lights	color_indicator	life_modifier	hand_modifier	printed_type_line	printed_text	content_warning	flavor_name	variation_of
0	card	0000579f-7b35-4ed3-b44c-db2a538066fe	44623693-51d6-49ad-8cd7-140505caf02f	[109722]	25527.0	25528.0	14240.0	13850.0	Fury Sliver	en	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	card	00006596-1166-4a79-8443-ca9f82e6db4e	8ae3562f-28b7-4462-96ed-be0cf7052ccc	[189637]	34586.0	34587.0	33347.0	21851.0	Kor Outfitter	en	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	card	0000a54c-a511-4925-92dc-01b937f9afad	dc4e2134-f0c2-49aa-9ea3-ebf83af1445c	[]	NaN	NaN	98659.0	NaN	Spirit	en	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	card	0000cd57-91fe-411f-b798-646e965eec37	9f0d82ae-38bf-45d8-8cda-982b6ead1d72	[435231]	65170.0	65171.0	145764.0	301766.0	Siren Lookout	en	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	card	00012bd8-ed68-4978-a22d-f450c8a6e048	5aa12aff-db3c-4be5-822b-3afdf536b33e	[1278]	NaN	NaN	1623.0	5664.0	Web	en	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN