1 简介
今天在 Kaggle 上看到了一个比较有意思的数据集(Dataset),收录了手游 Pokemon 中 721 只神奇宝贝的基本数据,包括编号(#)、名字(Name)、类别(Type 1)、二级分类(Type 2),以及基本属性(血量:HP,攻击力:Attack,防御力:Defense,魔攻:Sp. Atk(Special Attack),魔防:Sp. Def(Special Defense),速度:Speed)。
现参考 Kaggle 上的一些文章,分析探究 Pokemon 的类别对其基本属性的影响。
2 Pokemon 基本数据概要
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Fetch the Pokemon base-stat table over HTTP and preview its first five rows.
data_df = pd.read_csv(
    filepath_or_buffer="http://airing.ursb.me/data/Pokemon.csv",
)
data_df.head(n=5)
Out[1]:
In [2]:
# The last two columns (Generation, Legendary) are not needed for this
# analysis, so drop them and summarise what is left.
# The columns are named through `columns=`: passing the axis as a second
# positional argument to DataFrame.drop is rejected by pandas 2.0 and later.
data_df = data_df.drop(columns=['Generation', 'Legendary'])
data_df.describe()
Out[2]:
In [3]:
# Start by plotting HP against Attack to see how the two stats relate.
sns.jointplot(data=data_df, x="HP", y="Attack")
plt.show()
In [4]:
# Box plot of the Total column over the whole data set.
sns.boxplot(data=data_df, y="Total")
plt.show()
In [5]:
# The id column ('#') and 'Total' carry no information for the per-stat study,
# so build a second frame without them and box-plot the remaining columns.
# The columns are named through `columns=`: passing the axis as a second
# positional argument to DataFrame.drop is rejected by pandas 2.0 and later.
data_df_2 = data_df.drop(columns=['#', 'Total'])
sns.boxplot(data=data_df_2)
plt.show()
In [6]:
# Collect the labels of the int64 columns of data_df_2 (the stat columns).
# The selection is made on data_df_2 itself; the earlier version masked
# data_df_2.dtypes with a boolean Series built from data_df.dtypes, whose index
# carries extra labels.  Nothing is sliced off afterwards either: '#' has
# already been dropped from data_df_2, so a leading [1:] would discard the
# first real stat column (presumably HP) rather than the id column, leaving
# fewer entries than the six-row grid drawn from var_int[:6] later on.
var_int = data_df_2.select_dtypes(include='int64').columns
var_int
Out[6]:
In [7]:
# Draw one 50-bin histogram per column listed in var_int, laid out on a
# 3x3 grid inside a single figure.
l_int = len(var_int)
fig = plt.figure(figsize=(13, 8))
for position, column in enumerate(var_int, start=1):
    axes = fig.add_subplot(3, 3, position)
    axes.hist(data_df_2[column], bins=50)
    axes.set_title(column)
plt.show()
In [8]:
# Pairwise correlation between the stats.
# data_df_2 still holds the text columns (Name, Type 1, Type 2).  pandas 2.0+
# no longer drops non-numeric columns silently inside DataFrame.corr and
# raises instead, so restrict the frame to numeric columns first; on older
# pandas the resulting matrix is unchanged.
data_df_2.select_dtypes(include='number').corr()
Out[8]:
3 探索 Pokemon 类别对其属性的影响
In [9]:
# List the distinct primary types, then count how many Pokemon fall under
# each one and show the counts from largest to smallest.
type1 = data_df['Type 1'].unique()
print(type1)
data_type1 = data_df.groupby('Type 1')['#'].count()
data_type1.sort_values(ascending=False)
Out[9]:
In [10]:
# Pie chart of the share of each primary type: the eight most common types
# get their own slice and every remaining type is folded into "Other".
# Slice sizes and labels are computed from data_df instead of being typed in
# by hand, so the chart always agrees with the loaded data (the previous
# literals summed to 721 no matter how many rows the CSV actually held).
type_counts = data_df['Type 1'].value_counts()  # sorted, most common first
top_counts = type_counts.head(8)
labels = list(top_counts.index) + ['Other']
sizes = list(top_counts.values) + [type_counts.iloc[8:].sum()]
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'yellow', 'lightgreen', 'silver', 'white', 'pink']
# Pull only the trailing "Other" slice away from the centre.
explode = (0,) * len(top_counts) + (0.1,)
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')
plt.title("Percentage of Different Types of Pokemon")
plt.show()
In [11]:
# Box plot of Total per primary type, with each type encoded as an integer.
# A type's position in the list below is the integer code that ends up in the
# new 'Int_Type1' column and on the x axis.
type_to_int_dict = {
    type_name: code
    for code, type_name in enumerate([
        'Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison',
        'Electric', 'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock',
        'Ghost', 'Ice', 'Dragon', 'Dark', 'Steel', 'Flying',
    ])
}
data_df['Int_Type1'] = data_df['Type 1'].map(type_to_int_dict).astype(int)

sns.set(style="ticks")
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data_df, x="Int_Type1", y="Total", palette="PRGn", ax=ax)
sns.despine(offset=10, trim=True)
plt.show()
# Author's observation from the plot: the Dragon type (code 14) sits well
# above the level of the other types.
In [12]:
# Reshape data_df_2 to long form: Name / Type 1 / Type 2 stay as identifier
# columns, the former column label goes into "Stat" and its number into "value".
data_type1 = data_df_2.melt(id_vars=["Name", "Type 1", "Type 2"], var_name="Stat")
data_type1.head()
Out[12]:
In [13]:
# Swarm plot of every stat value from the long-form table, coloured by
# primary type, with the points of each type placed side by side.
plt.figure(figsize=(12, 10))
plt.ylim(0, 275)
# `dodge` is the current name of the option formerly spelled `split`; the old
# keyword was deprecated and is not accepted by current seaborn releases.
sns.swarmplot(x="Stat", y="value", data=data_type1, hue="Type 1", dodge=True, size=7)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
plt.show()
In [14]:
# A more direct view: one box plot per stat column (up to six), stacked
# vertically, each comparing the primary types.
fig = plt.figure(figsize=(13, 24))
for row, stat in enumerate(var_int[:6], start=1):
    ax1 = fig.add_subplot(6, 1, row)
    sns.boxplot(x=data_df['Type 1'], y=data_df_2[stat], ax=ax1)
plt.show()
In [15]:
# Author's reading of the box plots: Dragon types have the highest Attack,
# Steel types the strongest Defense and Flying types the highest Speed.
# A box plot marks where the quantiles lie, while a violin plot shows the
# density at every position, so the same data is drawn again as violins.
#
# Distribution of HP among all types of Pokemon.
# Pivot to a wide table: rows keyed by Name, one column per primary type.
hp_data = data_df[['Name', 'Type 1', 'HP']].pivot_table(
    index=['Name'], columns=['Type 1'], values='HP')
hp_data.head()

f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=hp_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("HP of Different Types of Pokemon")
sns.despine(left=True, bottom=True)
plt.show()
In [17]:
# Distribution of Attack among all types of Pokemon.
# Pivot to a wide table: rows keyed by Name, one column per primary type.
attack_data = data_df[['Name', 'Type 1', 'Attack']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Attack')
attack_data.head()

f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=attack_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Attack of Different Types of Pokemon")
sns.despine(left=True, bottom=True)
plt.show()
In [18]:
# Distribution of Defense among all types of Pokemon.
# Pivot to a wide table: rows keyed by Name, one column per primary type.
defense_data = data_df[['Name', 'Type 1', 'Defense']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Defense')
defense_data.head()

f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Defense of Different Types of Pokemon")
sns.despine(left=True, bottom=True)
plt.show()
In [22]:
# Distribution of Sp. Atk (special attack) among all types of Pokemon.
# Pivot to a wide table: rows keyed by Name, one column per primary type.
sp_attack_data = data_df[['Name', 'Type 1', 'Sp. Atk']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Sp. Atk')
sp_attack_data.head()

f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_attack_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Sp.Attack of Different Types of Pokemon")
sns.despine(left=True, bottom=True)
plt.show()
In [23]:
# Distribution of Sp. Def (special defense) among all types of Pokemon.
# Pivot to a wide table: rows keyed by Name, one column per primary type.
sp_defense_data = data_df[['Name', 'Type 1', 'Sp. Def']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Sp. Def')
sp_defense_data.head()

f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Sp.Defense of Different Types of Pokemon")
sns.despine(left=True, bottom=True)
plt.show()
In [24]:
# Distribution of Speed among all types of Pokemon.
# Pivot to a wide table: rows keyed by Name, one column per primary type.
speed_data = data_df[['Name', 'Type 1', 'Speed']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Speed')
speed_data.head()

f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=speed_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_ylim(0, 200)
ax.set_title("Speed of Different Types of Pokemon")
sns.despine(left=True, bottom=True)
plt.show()