1 简介

今天在 kaggle 上看到了一个比较有意思的 Dataset，收录了手游 Pokemon 中 721 只神奇宝贝的基本数据（由于 Mega 等形态单独成行，数据共 800 条记录），包括 id(#)，名字(Name)，类别(Type 1)，二级分类(Type 2)，六项基本属性之和(Total)，基本属性(血量:HP, 攻击力:Attack, 防御力:Defense, 魔攻:Sp. Atk, 魔防:Sp. Def, 速度:Speed)，以及世代(Generation)和是否为传说级(Legendary)。

现参考 kaggle 上的一些文章，做 pokemon 类别对其基本属性影响的分析探究。

2 Pokemon 基本数据概要

In [1]:

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

# Load the Pokemon stats table from the remote CSV into a DataFrame and
# preview its first five rows.
data_df = pd.read_csv("http://airing.ursb.me/data/Pokemon.csv")
data_df.head()

Out[1]:

	#	Name	Type 1	Type 2	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Legendary
0	1	Bulbasaur	Grass	Poison	318	45	49	49	65	65	45	1	False
1	2	Ivysaur	Grass	Poison	405	60	62	63	80	80	60	1	False
2	3	Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	False
3	3	VenusaurMega Venusaur	Grass	Poison	625	80	100	123	122	120	80	1	False
4	4	Charmander	Fire	NaN	309	39	52	43	60	50	65	1	False

In [2]:

# The last two columns (Generation, Legendary) are not used in this analysis,
# so drop them before summarising.
# The axis is passed by keyword: the positional form `drop(labels, 1)` is
# rejected by newer pandas releases, where every argument except `labels` is
# keyword-only.
data_df = data_df.drop(['Generation', 'Legendary'], axis=1)
data_df.describe()

Out[2]:

	#	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed
count	800.000000	800.00000	800.000000	800.000000	800.000000	800.000000	800.000000	800.000000
mean	362.813750	435.10250	69.258750	79.001250	73.842500	72.820000	71.902500	68.277500
std	208.343798	119.96304	25.534669	32.457366	31.183501	32.722294	27.828916	29.060474
min	1.000000	180.00000	1.000000	5.000000	5.000000	10.000000	20.000000	5.000000
25%	184.750000	330.00000	50.000000	55.000000	50.000000	49.750000	50.000000	45.000000
50%	364.500000	450.00000	65.000000	75.000000	70.000000	65.000000	70.000000	65.000000
75%	539.250000	515.00000	80.000000	100.000000	90.000000	95.000000	90.000000	90.000000
max	721.000000	780.00000	255.000000	190.000000	230.000000	194.000000	230.000000	180.000000

In [3]:

# Start with the relationship between HP and Attack: a joint plot of the two
# columns of data_df.
hp_attack_grid = sns.jointplot(data=data_df, x="HP", y="Attack")
plt.show()

In [4]:

# Box plot of the Total column over all rows. Total is the sum of the six base
# stats (HP, Attack, Defense, Sp. Atk, Sp. Def, Speed), not a count of Pokemon.
sns.boxplot(y="Total", data=data_df)
plt.show()

In [5]:

# The id ('#') and Total columns say nothing about an individual stat, so they
# are left out of the per-stat view.
# The axis is passed by keyword: the positional form `drop(labels, 1)` is
# rejected by newer pandas releases, where every argument except `labels` is
# keyword-only.
data_df_2 = data_df.drop(['#', 'Total'], axis=1)
# With only `data=` given, seaborn draws one box per numeric column.
sns.boxplot(data=data_df_2)
plt.show()

In [6]:

# Names of the int64 columns of data_df_2.
# The boolean mask is built from data_df_2's own dtypes. Building it from
# data_df.dtypes only worked because pandas re-aligned the longer mask to
# data_df_2's columns; it would break as soon as the two frames diverge.
stat_dtypes = data_df_2.dtypes
var_int = stat_dtypes[stat_dtypes == 'int64'].index
# NOTE(review): this slice drops the first integer column. '#' is already gone
# from data_df_2, so the column dropped here is HP -- confirm that is intended.
var_int = var_int[1:]
var_int

Out[6]:

Index(['Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'], dtype='object')

In [7]:

# Draw a 50-bin histogram for every column named in var_int, placing the
# panels left-to-right, top-to-bottom on a 3x3 grid.
l_int = len(var_int)
fig = plt.figure(figsize=(13, 8))
for position, column in enumerate(var_int, start=1):
    panel = fig.add_subplot(3, 3, position)
    panel.hist(data_df_2[column], bins=50)
    panel.set_title(column)

plt.show()

In [8]:

# Pairwise correlation between the stats.
# data_df_2 still contains the string columns Name, Type 1 and Type 2. Older
# pandas silently skipped them in corr(), but pandas >= 2.0 raises on
# non-numeric data, so restrict the frame to its numeric columns first.
data_df_2.select_dtypes(include=[np.number]).corr()

Out[8]:

	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed
HP	1.000000	0.422386	0.239622	0.362380	0.378718	0.175952
Attack	0.422386	1.000000	0.438687	0.396362	0.263990	0.381240
Defense	0.239622	0.438687	1.000000	0.223549	0.510747	0.015227
Sp. Atk	0.362380	0.396362	0.223549	1.000000	0.506121	0.473018
Sp. Def	0.378718	0.263990	0.510747	0.506121	1.000000	0.259133
Speed	0.175952	0.381240	0.015227	0.473018	0.259133	1.000000

3 探索 Pokemon 类别对其属性的影响

In [9]:

# List the distinct primary types, then count how many rows fall under each
# primary type and show the counts from largest to smallest.
type1 = data_df['Type 1'].unique()
print(type1)

rows_by_type = data_df.groupby('Type 1')
data_type1 = rows_by_type['#'].count()
data_type1.sort_values(ascending=False)

['Grass' 'Fire' 'Water' 'Bug' 'Normal' 'Poison' 'Electric' 'Ground' 'Fairy'
 'Fighting' 'Psychic' 'Rock' 'Ghost' 'Ice' 'Dragon' 'Dark' 'Steel' 'Flying']

Out[9]:

Type 1
Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Rock         44
Electric     44
Ground       32
Dragon       32
Ghost        32
Dark         31
Poison       28
Steel        27
Fighting     27
Ice          24
Fairy        17
Flying        4
Name: #, dtype: int64

In [10]:

# Pie chart of the share of each primary type. The eight most common types get
# their own wedge; every remaining type is folded into the exploded 'Other'
# wedge.
labels = ['Water', 'Normal', 'Grass', 'Bug', 'Psychic', 'Fire', 'Electric', 'Rock', 'Other']
# Wedge sizes are read from the data instead of being typed in by hand. The
# previous literal list gave 'Other' 175 rows, but the eight named types cover
# 546 of the 800 rows, so the remainder is 254.
type_counts = data_df['Type 1'].value_counts()
sizes = [int(type_counts[name]) for name in labels[:-1]]
sizes.append(int(type_counts.sum()) - sum(sizes))
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'yellow', 'lightgreen', 'silver', 'white', 'pink']
# Only the last wedge ('Other') is pulled away from the centre.
explode = (0, 0, 0, 0, 0, 0, 0, 0, 0.1)

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.title("Percentage of Different Types of Pokemon")
plt.show()

In [11]:

# Box plot of Total (the summed base stats) for each primary type. The types
# are shown on the x axis as integer codes; the code of a type is its position
# in the list below.
type_names = ['Grass', 'Fire', 'Water', 'Bug', 'Normal',
              'Poison', 'Electric', 'Ground', 'Fairy', 'Fighting',
              'Psychic', 'Rock', 'Ghost', 'Ice', 'Dragon',
              'Dark', 'Steel', 'Flying']
type_to_int_dict = {name: code for code, name in enumerate(type_names)}

# Store the integer code of every row's primary type in a new column.
type_codes = data_df['Type 1'].map(type_to_int_dict)
data_df['Int_Type1'] = type_codes.astype(int)

sns.set(style="ticks")
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x="Int_Type1", y="Total", data=data_df, palette="PRGn", ax=ax)
sns.despine(offset=10, trim=True)
plt.show()

# Author's observation: on average the Dragon type (code 14) sits far above
# the other types on Total.

In [12]:

# Reshape data_df_2 to long form: Name, Type 1 and Type 2 stay as identifier
# columns, the name of each remaining column goes into "Stat" and its number
# into "value".
id_columns = ["Name", "Type 1", "Type 2"]
data_type1 = pd.melt(data_df_2, id_vars=id_columns, var_name="Stat")
data_type1.head()

Out[12]:

	Name	Type 1	Type 2	Stat	value
0	Bulbasaur	Grass	Poison	HP	45
1	Ivysaur	Grass	Poison	HP	60
2	Venusaur	Grass	Poison	HP	80
3	VenusaurMega Venusaur	Grass	Poison	HP	80
4	Charmander	Fire	NaN	HP	39

In [13]:

# Swarm plot of every stat value, coloured by primary type, with one swarm per
# type placed side by side within each stat.
plt.figure(figsize=(12, 10))
plt.ylim(0, 275)
# `dodge=True` is the current name of the option formerly spelled `split=True`;
# seaborn renamed the keyword and newer releases no longer accept the old one.
sns.swarmplot(x="Stat", y="value", data=data_type1, hue="Type 1", dodge=True, size=7)
# Anchor the legend's upper-left corner at the axes' upper-right corner so it
# sits outside the plotting area.
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
plt.show()

In [14]:

# A clearer view of the stat values per primary type: one box plot per column
# in var_int (at most six), stacked vertically in a single tall figure.
fig = plt.figure(figsize=(13, 24))
primary_type = data_df['Type 1']
for row, col in enumerate(var_int[:6], start=1):
    ax1 = fig.add_subplot(6, 1, row)
    stat_values = data_df_2[col]
    sns.boxplot(x=primary_type, y=stat_values, ax=ax1)

plt.show()

In [15]:

# Author's observations from the box plots: Dragon types have the highest
# Attack, Steel types the strongest Defense and Flying types the highest Speed.
# A box plot marks the quantile positions, whereas a violin plot shows the
# density at every value, so the same data is drawn again as violins.

# Distribution of HP among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per primary type.
hp_data = data_df[['Name', 'Type 1', 'HP']]
hp_data = hp_data.pivot_table(values='HP', index=['Name'], columns=['Type 1'])
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=hp_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
# HP reaches 255 in this dataset, so an upper limit of 200 would cut off the
# top of the distribution; 275 is the range the swarm plot of all stats uses.
ax.set(ylim=(0, 275))
ax.set_title("HP of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()

In [17]:

# Violin plot of Attack for each primary type.
# Pivot to wide form: one row per Name, one column per primary type.
attack_data = data_df[['Name', 'Type 1', 'Attack']].pivot_table(
    values='Attack',
    index=['Name'],
    columns=['Type 1'],
)
# Not displayed: a notebook cell only shows its last expression.
attack_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=attack_data, ax=ax, palette="Set3", bw=.2, cut=1, linewidth=1)
ax.set_ylim(0, 200)
ax.set_title("Attack of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()

In [18]:

# Distribution of Defense among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per primary type.
defense_data = data_df[['Name', 'Type 1', 'Defense']]
defense_data = defense_data.pivot_table(values='Defense', index=['Name'], columns=['Type 1'])
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
# Defense reaches 230 in this dataset, so an upper limit of 200 would cut off
# the top of the distribution; 275 is the range the swarm plot of all stats
# uses.
ax.set(ylim=(0, 275))
ax.set_title("Defense of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()

In [22]:

# Violin plot of Sp. Atk for each primary type.
# Pivot to wide form: one row per Name, one column per primary type.
sp_attack_data = data_df[['Name', 'Type 1', 'Sp. Atk']].pivot_table(
    values='Sp. Atk',
    index=['Name'],
    columns=['Type 1'],
)
# Not displayed: a notebook cell only shows its last expression.
sp_attack_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_attack_data, ax=ax, palette="Set3", bw=.2, cut=1, linewidth=1)
ax.set_ylim(0, 200)
ax.set_title("Sp.Attack of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()

In [23]:

# Distribution of Sp. Def among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per primary type.
sp_defense_data = data_df[['Name', 'Type 1', 'Sp. Def']]
sp_defense_data = sp_defense_data.pivot_table(values='Sp. Def', index=['Name'], columns=['Type 1'])
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
# Sp. Def reaches 230 in this dataset, so an upper limit of 200 would cut off
# the top of the distribution; 275 is the range the swarm plot of all stats
# uses.
ax.set(ylim=(0, 275))
ax.set_title("Sp.Defense of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()

In [24]:

# Violin plot of Speed for each primary type.
# Pivot to wide form: one row per Name, one column per primary type.
speed_data = data_df[['Name', 'Type 1', 'Speed']].pivot_table(
    values='Speed',
    index=['Name'],
    columns=['Type 1'],
)
# Not displayed: a notebook cell only shows its last expression.
speed_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=speed_data, ax=ax, palette="Set3", bw=.2, cut=1, linewidth=1)
ax.set_ylim(0, 200)
ax.set_title("Speed of Different Types of Pokemon")
sns.despine(left=True, bottom=True)

plt.show()